Library¶
import pandas as pd
import numpy as np
import PreProcessingText as ppt
from collections import Counter, defaultdict
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
import squarify
from transformers import pipeline
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
4° Approach: BERTopic¶
Baseline Summary¶
Clustering Approach¶
- Parameter Setting: A high `min_cluster_size` (1200) was set for HDBSCAN to ensure well-defined clusters that each cover a significant share of the dataset. This baseline is intended to feed machine learning algorithms for prediction purposes.
Initial Clustering Results¶
Clusters Retrieved: 7 representative clusters were identified:
- Drug sales
- Bitcoin
- Scammers and seller reviews
- Marketplace advertising
- Purchase reviews
- Drug purchases
- Orders
Outliers: Initially, 34k outliers were found out of a total of 66k records.
Performance Metrics:
- Silhouette Score: 0.64
- Davies-Bouldin Score: 0.6
Outlier Reduction¶
Cosine Measure on Embeddings: By applying a cosine similarity measure with a 0.53 threshold, the number of outliers was reduced from 34k to 27k, reintroducing about 7k records.
Updated Performance Metrics:
- Silhouette Score: 0.51
- Davies-Bouldin Score: 0.8
Trade-off Analysis¶
Outlier Reintroduction: Reintroducing part of the outliers struck a balance: it avoided significant cluster degradation while keeping the clusters well separated and well defined, as the graphs show.
Cluster Distribution: The updated clusters are well-distributed:
- Maximum cluster size: 23% of the retained (non-outlier) records
- Minimum cluster size: 7% of the retained (non-outlier) records
- This distribution avoids large imbalances between clusters.
Data Loss and Potential Adjustments¶
Data Loss: Approximately 40% of the initial dataset was lost.
Potential Correction: This data loss can potentially be mitigated by lowering the cosine similarity threshold between embeddings.
# Load the cleaned thread titles and keep one row per distinct, non-null title.
df = (
    pd.read_csv('cleaned_data_name_thread.csv')
    .dropna(subset=['name_thread'])
    .drop_duplicates(subset=['name_thread'], keep='first')
)
len(df)  # number of rows left after cleaning
66735
# Sentence-embedding model used to encode the thread titles.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
# ppt.TextClustering is a project-local helper; presumably it builds
# `corpus` from the 'name_thread' column -- verify in PreProcessingText.
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
# Sanity check: the two lengths are expected to match (one embedding per document).
len(tc1.corpus), len(tc1.corpus_embeddings)
# --- Baseline BERTopic configuration (HDBSCAN min_cluster_size=1200) ---

# One seed topic made of domain keywords.
# NOTE(review): 'tor site' is listed twice and 'heroine' may be a typo for
# 'heroin'; both are left untouched so the recorded results stay reproducible.
seed_topic_list = [
    [
        'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking',
        'scammer', 'market', 'vendor', 'bitcoin', 'mdma', 'coke', 'lsd',
        'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking',
    ],
]

# Candidate zero-shot topic names, read from the 'intent' column.
# NOTE(review): the names are wrapped in an extra outer list; BERTopic's docs
# describe `zeroshot_topic_list` as a flat list of strings -- confirm intent.
intent_labels = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
zeroshot_topic_list = [intent_labels]

# Dimensionality reduction (seeded) followed by density-based clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=1200,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Keyword extraction components.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=0.3)

topic_model = BERTopic(
    language='multilingual',
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    seed_topic_list=seed_topic_list,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.05,
    verbose=True,
)
# Fit the topic model on the precomputed embeddings, then dump the topic
# table and the keyword/weight list of every topic id that was assigned.
topics, probs = topic_model.fit_transform(
    tc1.corpus,
    tc1.corpus_embeddings,
)
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:", topic_model.get_topic(topic_id), sep="\n")
Topic Count Name \
0 -1 34449 -1_new_free_ticket_help
1 0 7495 0_weed_xanax_cocaine_coke
2 1 6093 1_market_dream_empire_nightmare
3 2 5034 2_vendor_scammer_scam_scamming
4 3 4087 3_review_vendor_feedback_mdma
5 4 4003 4_mdma_lsd_shit_whats
6 5 2402 5_order_package_delivery_shipping
7 6 1966 6_bitcoin_card_wallet_credit
Representation \
0 [new, free, ticket, help, update, account, mdm...
1 [weed, xanax, cocaine, coke, ketamine, mg, can...
2 [market, dream, empire, nightmare, vendor, wal...
3 [vendor, scammer, scam, scamming, exit, scamme...
4 [review, vendor, feedback, mdma, mg, sample, r...
5 [mdma, lsd, shit, whats, fuck, gone, got, guy,...
6 [order, package, delivery, shipping, tracking,...
7 [bitcoin, card, wallet, credit, coin, carding,...
Representative_Docs
0 [canadianflavor weed shatter cbd edible hash c...
1 [high quality weed thc product europe, new xan...
2 [next market, dream market vendor rstclass nig...
3 [looking good reliable vendor sell ounce, vend...
4 [empire vendor cocaine review, first ever revi...
5 [hey ro im gon na pull pk, life wonderful life...
6 [order accepted day still hasnt marked shipped...
7 [credit cards paypal prepaid card find, got cc...
Topic 0:
[('weed', 0.5972313505812425), ('xanax', 0.5664832282989213), ('cocaine', 0.5350787342936356), ('coke', 0.4710111701375004), ('ketamine', 0.46985128023380035), ('mg', 0.46256209204548415), ('cannabis', 0.41853925594172725), ('drug', 0.4053330171594432), ('pill', 0.3907822559981816), ('quality', 0.38621568363790615)]
Topic 1:
[('market', 0.892430998800942), ('dream', 0.6865843677324943), ('empire', 0.6830028029033173), ('nightmare', 0.5681939396872522), ('vendor', 0.34305231363817884), ('wall', 0.3245499595042113), ('marketplace', 0.319921898437173), ('scam', 0.2961241301762431), ('exit', 0.2960733863924834), ('link', 0.2915460778160393)]
Topic 2:
[('vendor', 0.6950361459297074), ('scammer', 0.6725026815231682), ('scam', 0.4980623980369779), ('scamming', 0.46575246018365657), ('exit', 0.44160475610894967), ('scammed', 0.40051759892624533), ('looking', 0.37884048200047027), ('warning', 0.37715463753082534), ('reliable', 0.37144259341974245), ('buyer', 0.3708904841304073)]
Topic 3:
[('review', 1.002255217202406), ('vendor', 0.5076272530565451), ('feedback', 0.4049037794348937), ('mdma', 0.381329954044546), ('mg', 0.37619091451980585), ('sample', 0.3754397070467268), ('reviews', 0.3504300951320543), ('lsd', 0.3465899767001684), ('opinion', 0.3303160657068881), ('xanax', 0.33022254366369147)]
Topic 4:
[('mdma', 0.38275973612659386), ('lsd', 0.3779572278615291), ('shit', 0.35340590919386444), ('whats', 0.34834774258692336), ('fuck', 0.3264035078860319), ('gone', 0.31797094824590016), ('got', 0.3167851762249627), ('guy', 0.3153758862961693), ('dead', 0.31361936874635366), ('going', 0.3042237209259171)]
Topic 5:
[('order', 0.9350712100343167), ('package', 0.6655706541276237), ('delivery', 0.562721266995139), ('shipping', 0.527231820138037), ('tracking', 0.5122872117651205), ('shipped', 0.48839280205239965), ('ordering', 0.4784769909883374), ('cancelled', 0.47119974969542505), ('pack', 0.4566507281813944), ('delivered', 0.45351148583756845)]
Topic 6:
[('bitcoin', 0.8235475804294793), ('card', 0.7734286502423073), ('wallet', 0.6772588642347616), ('credit', 0.6731588060336892), ('coin', 0.5703668040987371), ('carding', 0.5529443276986676), ('btc', 0.5121844608207589), ('cash', 0.5037356917020909), ('debit', 0.500260454896595), ('coinbase', 0.49454000630077194)]
Topic -1:
[('new', 0.28398750337326484), ('free', 0.2771677713524054), ('ticket', 0.2699448449851029), ('help', 0.2697705189262906), ('update', 0.2675394807401724), ('account', 0.26547262677161937), ('mdma', 0.2638718211547908), ('vendor', 0.2588459510247759), ('dispute', 0.25440435619535773), ('need', 0.2488688355528112)]
# Cluster-quality metrics on the reduced embeddings, outliers (-1) excluded.
# NOTE(review): fit_transform re-fits the UMAP instance stored on the topic
# model instead of reusing its fitted state; presumably the seeded
# random_state keeps the projection identical -- confirm.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

topic_ids = np.asarray(topics)
is_clustered = topic_ids != -1
indices = np.flatnonzero(is_clustered).tolist()
X = umap_embeddings[is_clustered]
labels = topic_ids[is_clustered].tolist()

# silhouette_score returns one mean value over all samples.
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6388389468193054 Davies_bouldin_score: 0.5523262827209047
# Scatter the first two UMAP components of every clustered (non-outlier)
# document, coloured by topic id.
#
# The previous version computed np.argsort(silhouette_scores)[-10:], but
# silhouette_scores is the single mean value returned by silhouette_score, so
# no per-point ranking exists and the result was never used. That dead code is
# removed and the title now describes what is actually drawn.
plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar()
plt.title('UMAP projection of the clustered documents (outliers excluded)', fontsize=24)
plt.show()
# Re-extract topic keywords with a vocabulary of 1- to 5-grams.
# NOTE(review): the keyword weights printed after this call are on a different
# scale than those printed right after fitting; presumably because
# update_topics is not given the representation/c-TF-IDF models -- confirm.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 5),
)
topic_model.update_topics(
    tc1.corpus,
    vectorizer_model=vectorizer_model,
)
topic_model.get_topic_freq()
| Topic | Count | |
|---|---|---|
| 0 | -1 | 34449 |
| 5 | 0 | 7495 |
| 2 | 1 | 6093 |
| 1 | 2 | 5034 |
| 7 | 3 | 4087 |
| 6 | 4 | 4003 |
| 3 | 5 | 2402 |
| 4 | 6 | 1966 |
# Dump the refreshed topic table and the keyword/weight list of each topic id.
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:", topic_model.get_topic(topic_id), sep="\n")
Topic Count Name \
0 -1 34449 -1_vendor_new_free_help
1 0 7495 0_weed_vendor_xanax_mg
2 1 6093 1_market_empire_dream_vendor
3 2 5034 2_vendor_scammer_scam_looking
4 3 4087 3_review_vendor review_vendor_review vendor
5 4 4003 4_mdma_lsd_good_got
6 5 2402 5_order_package_shipping_delivery
7 6 1966 6_card_bitcoin_wallet_credit
Representation \
0 [vendor, new, free, help, best, account, uk, u...
1 [weed, vendor, xanax, mg, cocaine, uk, best, c...
2 [market, empire, dream, vendor, nightmare, dre...
3 [vendor, scammer, scam, looking, scamming, exi...
4 [review, vendor review, vendor, review vendor,...
5 [mdma, lsd, good, got, shit, whats, guy, fuck,...
6 [order, package, shipping, delivery, vendor, p...
7 [card, bitcoin, wallet, credit, btc, carding, ...
Representative_Docs
0 [canadianflavor weed shatter cbd edible hash c...
1 [high quality weed thc product europe, new xan...
2 [next market, dream market vendor rstclass nig...
3 [looking good reliable vendor sell ounce, vend...
4 [empire vendor cocaine review, first ever revi...
5 [hey ro im gon na pull pk, life wonderful life...
6 [order accepted day still hasnt marked shipped...
7 [credit cards paypal prepaid card find, got cc...
Topic 0:
[('weed', 0.02425497350614531), ('vendor', 0.021978341010015688), ('xanax', 0.02077949072716719), ('mg', 0.01948517638840499), ('cocaine', 0.018417804414484252), ('uk', 0.015046793957699879), ('best', 0.013425752943917355), ('coke', 0.012717130457267087), ('ketamine', 0.01175969464362258), ('cannabis', 0.010948216683877144)]
Topic 1:
[('market', 0.09008978566905657), ('empire', 0.055274112551010335), ('dream', 0.04917325935832957), ('vendor', 0.024276714575283735), ('nightmare', 0.023605168431774765), ('dream market', 0.016025449931173885), ('empire market', 0.014646720705699409), ('new', 0.009033909010090109), ('nightmare market', 0.008867402221856543), ('scam', 0.006303868464254871)]
Topic 2:
[('vendor', 0.09965429794348642), ('scammer', 0.025788920958809015), ('scam', 0.017833603310448354), ('looking', 0.01337570071081538), ('scamming', 0.012208815488636926), ('exit', 0.011806364340026236), ('scammed', 0.008689720115543394), ('uk', 0.008678133768927804), ('good', 0.008493482524539575), ('warning', 0.008418582129949287)]
Topic 3:
[('review', 0.1428141634073404), ('vendor review', 0.058876246025626515), ('vendor', 0.05315846344525214), ('review vendor', 0.021049951157661017), ('review vendor review', 0.017406474951027713), ('review review', 0.015138695407876355), ('mg', 0.012888546716744416), ('mdma', 0.011146461993445255), ('sample', 0.010133356066428198), ('dream', 0.009783289767907996)]
Topic 4:
[('mdma', 0.011231558108969678), ('lsd', 0.009238251834183116), ('good', 0.007359917621616781), ('got', 0.006638868206622288), ('shit', 0.0065802885463340675), ('whats', 0.006051630264178851), ('guy', 0.005697866126116449), ('fuck', 0.005394916465354471), ('going', 0.005375411718474036), ('wsm', 0.0052967375805114646)]
Topic 5:
[('order', 0.09533424569336707), ('package', 0.025076372096897597), ('shipping', 0.02284913659637588), ('delivery', 0.018139605364174704), ('vendor', 0.014195026757439324), ('pack', 0.014024930561711633), ('tracking', 0.012976075064416448), ('shipped', 0.012741042718045418), ('ordering', 0.01153929794529684), ('time', 0.01087192180365464)]
Topic 6:
[('card', 0.04045581193563761), ('bitcoin', 0.03526436871145481), ('wallet', 0.02671909128748556), ('credit', 0.02286661027552805), ('btc', 0.0196385675748142), ('carding', 0.018970779081355412), ('coin', 0.016677548495845462), ('credit card', 0.014601870612078016), ('cash', 0.012420616388040553), ('bank', 0.010979756425111214)]
Topic -1:
[('vendor', 0.013820616851140987), ('new', 0.009152016420677532), ('free', 0.006913858221511509), ('help', 0.006453408973195096), ('best', 0.0060032500179123234), ('account', 0.005801364375676093), ('uk', 0.005664162822486113), ('update', 0.005547486073465391), ('crosspost', 0.005503646525948444), ('need', 0.00541678801673178)]
# Built-in BERTopic visualisations of the fitted topics.
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()

# 2-D projection used only for the document scatter plot. It is seeded like
# the clustering UMAP so the layout is reproducible between runs; previously
# no random_state was set and every run produced a different picture.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
                                hide_document_hover=True, hide_annotations=True)
topic_model.visualize_barchart()
# Reassign outlier documents (-1) with the "embeddings" strategy and a 0.53
# threshold, then refresh the topic representations with the new assignment.
new_topics = topic_model.reduce_outliers(
    tc1.corpus,
    topics,
    strategy="embeddings",
    embeddings=tc1.corpus_embeddings,
    threshold=0.53,
)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 27613 | -1_anyone_new_help_free | [anyone, new, help, free, please, update, tick... | [canadianflavor weed shatter cbd edible hash c... |
| 1 | 0 | 8645 | 0_weed_xanax_vendor_cocaine | [weed, xanax, vendor, cocaine, mg, uk, coke, b... | [high quality weed thc product europe, new xan... |
| 2 | 1 | 6236 | 1_market_empire_dream_nightmare | [market, empire, dream, nightmare, vendor, dre... | [next market, dream market vendor rstclass nig... |
| 3 | 2 | 6907 | 2_vendor_scammer_scam_looking | [vendor, scammer, scam, looking, scamming, sal... | [looking good reliable vendor sell ounce, vend... |
| 4 | 3 | 4230 | 3_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [empire vendor cocaine review, first ever revi... |
| 5 | 4 | 6299 | 4_mdma_lsd_get_looking | [mdma, lsd, get, looking, wsm, good, btc, ques... | [hey ro im gon na pull pk, life wonderful life... |
| 6 | 5 | 2776 | 5_order_package_shipping_delivery | [order, package, shipping, delivery, pack, shi... | [order accepted day still hasnt marked shipped... |
| 7 | 6 | 2823 | 6_bitcoin_card_wallet_btc | [bitcoin, card, wallet, btc, bank, credit, car... | [credit cards paypal prepaid card find, got cc... |
# Same visualisations as before, now reflecting the topics after outlier reduction.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
)
topic_model.visualize_hierarchy()
topic_model.visualize_topics()
topic_model.visualize_barchart()
# Cluster-quality metrics after outlier reduction; documents still labelled -1
# are excluded.
# NOTE(review): fit_transform re-fits the UMAP instance stored on the topic
# model; presumably the seeded random_state keeps the projection identical to
# the one used for clustering -- confirm.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

topic_ids = np.asarray(new_topics)
is_clustered = topic_ids != -1
indices = np.flatnonzero(is_clustered).tolist()
X = umap_embeddings[is_clustered]
labels = topic_ids[is_clustered].tolist()

# silhouette_score returns one mean value over all samples.
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5083789229393005 Davies_bouldin_score: 0.7570962651091117
# Build the per-document timestamp list and compute topic frequencies over time.
#
# The frame is lower-cased and de-duplicated again, presumably to mirror the
# preprocessing done inside ppt.TextClustering so that row i of `df` matches
# document i of `tc1.corpus` -- verify against PreProcessingText.
# (A trailing .dropna() on the assigned Series was removed: column assignment
# aligns on the index, so it never dropped anything. NaNs are removed below.)
df['name_thread'] = df['name_thread'].str.lower()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()

# Timestamps are paired with documents purely by position, so a length
# mismatch means the alignment is broken; fail with a clear message here.
if len(created_on) != len(tc1.corpus):
    raise ValueError(
        f"created_on has {len(created_on)} entries but the corpus has "
        f"{len(tc1.corpus)} documents; timestamps are not aligned with documents"
    )
len(created_on)

topics_over_time = topic_model.topics_over_time(
    tc1.corpus,
    created_on,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=100,
)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)
# Assemble one row per clustered (non-outlier) document, enriched with the
# metadata of its topic.
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
# `probs` comes from the original fit, not from reduce_outliers, so a document
# that was reassigned from -1 keeps the probability of its pre-reduction label.
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})

# A left join keeps the rows of `results` in their original order, and
# validate='many_to_one' guarantees that no row is duplicated. Both are
# required because the UMAP vectors below are attached by position.
results_final = pd.merge(
    results,
    topic_model.get_topic_info(),
    on='Topic',
    how='left',
    validate='many_to_one',
)
# `X` must be the UMAP projection computed for the same `new_topics`
# (same non-outlier rows, same order); pandas raises if the lengths differ.
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(37916, 10)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | Representation | Representative_Docs | UMAP_embedding | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | checks | [0.052164897, 0.029597273, -0.03666609, 0.0051... | 4 | 0.000000 | 2020-01-09 | 6299 | 4_mdma_lsd_get_looking | [mdma, lsd, get, looking, wsm, good, btc, ques... | [hey ro im gon na pull pk, life wonderful life... | [1.6488198, 9.914265, 1.442794, 2.8094368, -0.... |
| 1 | trusted vendor status | [0.02445144, -0.008732641, -0.0050215074, 0.01... | 2 | 0.944247 | 2020-01-09 | 6907 | 2_vendor_scammer_scam_looking | [vendor, scammer, scam, looking, scamming, sal... | [looking good reliable vendor sell ounce, vend... | [2.910516, 10.281041, 1.650234, 3.0320778, -0.... |
| 2 | empire exit scam iiflux user incomming | [0.02890829, 0.036081452, -0.027694924, -0.007... | 1 | 1.000000 | 2019-11-06 | 6236 | 1_market_empire_dream_nightmare | [market, empire, dream, nightmare, vendor, dre... | [next market, dream market vendor rstclass nig... | [1.5884036, 9.8587885, 3.3090453, 2.652358, 2.... |
| 3 | ecstasy vendor packs | [-0.022524439, 0.03949761, -0.023750877, 0.033... | 5 | 0.797741 | 2020-01-09 | 2776 | 5_order_package_shipping_delivery | [order, package, shipping, delivery, pack, shi... | [order accepted day still hasnt marked shipped... | [2.0245404, 10.517631, 2.3443217, 3.7595236, -... |
| 4 | opening bank account person fake id | [-0.029834118, 0.03354508, -0.012210185, -0.02... | 6 | 1.000000 | 2019-11-06 | 2823 | 6_bitcoin_card_wallet_btc | [bitcoin, card, wallet, btc, bank, credit, car... | [credit cards paypal prepaid card find, got cc... | [0.7278271, 9.884823, 1.8116106, 2.9336705, -0... |
# Persist the fitted model and the per-document results. Both target
# directories must already exist.
# NOTE(review): pickle serialization is tied to the library versions used when
# saving and is unsafe to load from untrusted sources.
topic_model.save(
    "Models/topic_model_0.64SilNew",
    serialization='pickle',
)
results_final.to_parquet(
    'ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_1200cluster_0.64sil_renewout.parquet'
)

# Topic-size distribution, as a histogram and as a pie chart of shares.
sns.histplot(results_final, x='Topic', discrete=True);
topic_counts = results_final.value_counts('Topic')
plt.pie(topic_counts, labels=topic_counts.index, autopct='%1.1f%%');
500 min cluster size¶
# --- Same pipeline as the baseline, with HDBSCAN min_cluster_size=500 ---

# One seed topic made of domain keywords.
# NOTE(review): 'tor site' is listed twice and 'heroine' may be a typo for
# 'heroin'; both are left untouched so the recorded results stay reproducible.
seed_topic_list = [
    [
        'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking',
        'scammer', 'market', 'vendor', 'bitcoin', 'mdma', 'coke', 'lsd',
        'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking',
    ],
]

# Candidate zero-shot topic names, read from the 'intent' column.
# NOTE(review): the names are wrapped in an extra outer list; BERTopic's docs
# describe `zeroshot_topic_list` as a flat list of strings -- confirm intent.
intent_labels = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
zeroshot_topic_list = [intent_labels]

# Dimensionality reduction (seeded) followed by density-based clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=500,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Keyword extraction components.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=0.3)

topic_model = BERTopic(
    language='multilingual',
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    seed_topic_list=seed_topic_list,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.05,
    verbose=True,
)

# Fit on the precomputed embeddings and show the resulting topic table.
topics, probs = topic_model.fit_transform(
    tc1.corpus,
    tc1.corpus_embeddings,
)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 28000 | -1_mdma_new_link_lsd | [mdma, new, link, lsd, help, free, vendor, nee... | [need high quality fake id check , big thanks ... |
| 1 | 0 | 4930 | 0_xanax_coke_cocaine_ketamine | [xanax, coke, cocaine, ketamine, mg, drug, pil... | [promo sale mg adderall ad xanax mg lsd mdma u... |
| 2 | 1 | 4469 | 1_bitcoin_card_bank_carding | [bitcoin, card, bank, carding, monero, wallet,... | [way cash bank log using btc, send bitcoin get... |
| 3 | 2 | 4227 | 2_dread_sub_lsd_shit | [dread, sub, lsd, shit, mdma, whats, guy, fuck... | [hey guy xangod man, let guy know dread host w... |
| 4 | 3 | 3702 | 3_market_dream_nightmare_dreammarket | [market, dream, nightmare, dreammarket, market... | [not order nightmare market, nightmare market ... |
| 5 | 4 | 3469 | 4_review_vendor_reviews_mg | [review, vendor, reviews, mg, vendymcvendface,... | [thclear ml purple kush vape cart review, vend... |
| 6 | 5 | 3410 | 5_order_package_pack_dispute | [order, package, pack, dispute, delivery, ship... | [package custom month love letter nothing, pac... |
| 7 | 6 | 2700 | 6_vendor_looking_seller_vendors | [vendor, looking, seller, vendors, buyer, lsd,... | [best vendor uk lsd, looking good vendor cc fu... |
| 8 | 7 | 1694 | 7_weed_cannabis_marijuana_hash | [weed, cannabis, marijuana, hash, quality, str... | [hash weed ship eu good vendor also usa, new i... |
| 9 | 8 | 1540 | 8_darknet_dark_web_sentenced | [darknet, dark, web, sentenced, drug, darkweb,... | [tacoma man sentenced four year dealing drugs ... |
| 10 | 9 | 1502 | 9_empire_dispute_deposit_empiremarket | [empire, dispute, deposit, empiremarket, scamm... | [empire next, give me empire, empire anyone else] |
| 11 | 10 | 1475 | 10_account_password_pgp_hacking | [account, password, pgp, hacking, hacked, secu... | [vendor enerygcontrolled hacked ca nt log pass... |
| 12 | 11 | 1314 | 11_tried_anybody_heard_ordered | [tried, anybody, heard, ordered, used, recentl... | [anybody heard pasitheas, anyone order recentl... |
| 13 | 12 | 1031 | 12_scammer_scam_exit_scamming | [scammer, scam, exit, scamming, warning, scamm... | [xangod scammer going exit scam proof, cottage... |
| 14 | 13 | 777 | 13_update_maintenance_updated_upgrade | [update, maintenance, updated, upgrade, vender... | [shipping update, update order, vendor update] |
| 15 | 14 | 681 | 14_ticket_support_deposit_month | [ticket, support, deposit, month, response, an... | [support ticket ticket, please help support ti... |
| 16 | 15 | 608 | 15_sample_samples_free_test | [sample, samples, free, test, testing, lab, te... | [xanax mg shipping free samples, new vendor fr... |
# Cluster-quality metrics on the reduced embeddings, outliers (-1) excluded.
# NOTE(review): fit_transform re-fits the UMAP instance stored on the topic
# model instead of reusing its fitted state; presumably the seeded
# random_state keeps the projection identical -- confirm.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

topic_ids = np.asarray(topics)
is_clustered = topic_ids != -1
indices = np.flatnonzero(is_clustered).tolist()
X = umap_embeddings[is_clustered]
labels = topic_ids[is_clustered].tolist()

# silhouette_score returns one mean value over all samples.
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5718363523483276 Davies_bouldin_score: 0.6211900149809264
# Scatter two UMAP components of every clustered (non-outlier) document,
# coloured by topic id, with one legend entry per topic.
#
# The previous version computed np.argsort(silhouette_scores)[-10:], but
# silhouette_scores is the single mean value returned by silhouette_score, so
# no per-point ranking exists and the result was never used. That dead code is
# removed and the title now describes what is actually drawn.
unique_labels = np.unique(labels)
cmap = plt.cm.magma
# One shared normalisation so legend markers, colour bar and points agree.
norm = plt.Normalize(vmin=min(labels), vmax=max(labels))

plt.figure(figsize=(10, 5))
# NOTE(review): this plots components 1 and 2, whereas the 1200-cluster plot
# earlier in the notebook plots components 0 and 1 -- confirm this is intended.
scatter = plt.scatter(X[:, 1], X[:, 2], c=labels, cmap=cmap, norm=norm, s=5)
plt.gca().set_aspect('equal', 'datalim')

handles = [
    plt.Line2D([0], [0], marker='o', color=cmap(norm(label)), linestyle='', markersize=10)
    for label in unique_labels
]
legend_labels = [f'Class {label}' for label in unique_labels]
plt.legend(handles, legend_labels, title="Classes")
# Tick at the actual topic ids rather than assuming they are 0..k-1.
plt.colorbar(scatter, ticks=unique_labels)
plt.title('UMAP projection of the clustered documents (outliers excluded)', fontsize=24)
plt.show()
# Re-extract topic keywords with a vocabulary of 1- to 5-grams.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)

# Built-in BERTopic visualisations of the fitted topics.
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()

# 2-D projection used only for the document scatter plot. It is seeded like
# the clustering UMAP so the layout is reproducible between runs; previously
# no random_state was set and every run produced a different picture.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
                                hide_document_hover=True, hide_annotations=True)

# Reassign outlier documents (-1) with the "embeddings" strategy and a 0.6
# threshold, then refresh the topic representations with the new assignment.
new_topics = topic_model.reduce_outliers(
    tc1.corpus,
    topics,
    strategy="embeddings",
    embeddings=tc1.corpus_embeddings,
    threshold=0.6,
)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23928 | -1_new_vendor_help_uk | [new, vendor, help, uk, need, mdma, best, free... | [need high quality fake id check , big thanks ... |
| 1 | 0 | 5207 | 0_xanax_cocaine_mg_coke | [xanax, cocaine, mg, coke, ketamine, vendor, p... | [promo sale mg adderall ad xanax mg lsd mdma u... |
| 2 | 1 | 4512 | 1_bitcoin_card_bank_carding | [bitcoin, card, bank, carding, monero, wallet,... | [way cash bank log using btc, send bitcoin get... |
| 3 | 2 | 4944 | 2_dread_mdma_lsd_get | [dread, mdma, lsd, get, sub, shit, guy, lookin... | [hey guy xangod man, let guy know dread host w... |
| 4 | 3 | 3801 | 3_market_dream_nightmare_dream market | [market, dream, nightmare, dream market, vendo... | [not order nightmare market, nightmare market ... |
| 5 | 4 | 3706 | 4_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [thclear ml purple kush vape cart review, vend... |
| 6 | 5 | 3434 | 5_order_dispute_pack_package | [order, dispute, pack, package, shipping, deli... | [package custom month love letter nothing, pac... |
| 7 | 6 | 4123 | 6_vendor_vendor vendor_looking_best | [vendor, vendor vendor, looking, best, inquiry... | [best vendor uk lsd, looking good vendor cc fu... |
| 8 | 7 | 1848 | 7_weed_cannabis_uk_weed vendor | [weed, cannabis, uk, weed vendor, vendor, qual... | [hash weed ship eu good vendor also usa, new i... |
| 9 | 8 | 1557 | 8_darknet_dark_dark web_web | [darknet, dark, dark web, web, drug, sentenced... | [tacoma man sentenced four year dealing drugs ... |
| 10 | 9 | 1835 | 9_empire_empire market_market_empire empire | [empire, empire market, market, empire empire,... | [empire next, give me empire, empire anyone else] |
| 11 | 10 | 1542 | 10_account_pgp_password_vendor account | [account, pgp, password, vendor account, crypt... | [vendor enerygcontrolled hacked ca nt log pass... |
| 12 | 11 | 1394 | 11_anyone_has_has anyone_anybody | [anyone, has, has anyone, anybody, tried, anyo... | [anybody heard pasitheas, anyone order recentl... |
| 13 | 12 | 1398 | 12_scammer_scam_exit_scamming | [scammer, scam, exit, scamming, scammed, warni... | [xangod scammer going exit scam proof, cottage... |
| 14 | 13 | 826 | 13_update_maintenance_updated_update update | [update, maintenance, updated, update update, ... | [shipping update, update order, vendor update] |
| 15 | 14 | 682 | 14_ticket_support ticket_support_please | [ticket, support ticket, support, please, depo... | [support ticket ticket, please help support ti... |
| 16 | 15 | 792 | 15_sample_free_free sample_samples | [sample, free, free sample, samples, free samp... | [xanax mg shipping free samples, new vendor fr... |
# Visualisations reflecting the topics after outlier reduction; the bar chart
# is asked for 16 topics.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
)
topic_model.visualize_topics()
topic_model.visualize_hierarchy()
topic_model.visualize_barchart(top_n_topics=16)
# Cluster-quality metrics after outlier reduction; documents still labelled -1
# are excluded.
# NOTE(review): fit_transform re-fits the UMAP instance stored on the topic
# model; presumably the seeded random_state keeps the projection identical to
# the one used for clustering -- confirm.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

topic_ids = np.asarray(new_topics)
is_clustered = topic_ids != -1
indices = np.flatnonzero(is_clustered).tolist()
X = umap_embeddings[is_clustered]
labels = topic_ids[is_clustered].tolist()

# silhouette_score returns one mean value over all samples.
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.49986162781715393 Davies_bouldin_score: 0.7193546666619981
# Assemble one row per clustered (non-outlier) document, enriched with the
# metadata of its topic, and write the table to parquet.
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
# `probs` comes from the original fit, not from reduce_outliers, so a document
# that was reassigned from -1 keeps the probability of its pre-reduction label.
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})

# A left join keeps the rows of `results` in their original order, and
# validate='many_to_one' guarantees that no row is duplicated. Both are
# required because the UMAP vectors below are attached by position.
results_final = pd.merge(
    results,
    topic_model.get_topic_info(),
    on='Topic',
    how='left',
    validate='many_to_one',
)
# `X` must be the UMAP projection computed for the same `new_topics`
# (same non-outlier rows, same order); pandas raises if the lengths differ.
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
# NOTE(review): the file name says "0.54sil", but the silhouette scores printed
# for this configuration are about 0.57 before and 0.50 after outlier
# reduction -- confirm the label.
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_500cluster_0.54sil_renewout.parquet')
(41601, 10)
# Topic frequencies over time; `created_on` is paired with the corpus by position.
topics_over_time = topic_model.topics_over_time(
    tc1.corpus,
    created_on,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=100,
)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)

# Topic-size distribution, as a pie chart of shares and as a histogram.
topic_counts = results_final.value_counts('Topic')
plt.pie(topic_counts, labels=topic_counts.index, autopct='%1.1f%%');
sns.histplot(results_final, x='Topic', discrete=True);

# NOTE(review): the file name ends in "300", but this model was built with
# min_cluster_size=500 -- confirm the label. Pickle serialization is also tied
# to the library versions used when saving.
topic_model.save(
    "Models/topic_model_0.50Sil300",
    serialization='pickle',
)
400 all-MiniLM-L6-v2¶
# Reload the cleaned thread titles from disk and keep one row per distinct,
# non-null title.
df = (
    pd.read_csv('cleaned_data_name_thread.csv')
    .dropna(subset=['name_thread'])
    .drop_duplicates(subset=['name_thread'], keep='first')
)
len(df)  # number of rows left after cleaning
66735
# Switch the sentence-embedding model to all-MiniLM-L6-v2 and re-encode.
model = SentenceTransformer('all-MiniLM-L6-v2')
# ppt.TextClustering is a project-local helper; presumably it builds
# `corpus` from the 'name_thread' column -- verify in PreProcessingText.
tc1 = ppt.TextClustering(df, 'name_thread')
# Last expression of the cell, so its return value is displayed.
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
2024-06-28 13:20:15,481 - PreProcessingText - INFO - Encoding the corpus. This might take a while. Batches: 100%|██████████| 1024/1024 [08:07<00:00, 2.10it/s]
array([[-0.00200396, 0.06075239, 0.00081512, ..., -0.12558922,
-0.01391758, 0.08301434],
[-0.03566444, -0.06874751, -0.09249493, ..., 0.00164954,
0.000249 , 0.04543216],
[-0.00444046, 0.0166403 , -0.03543836, ..., 0.04193429,
0.03246673, -0.03589631],
...,
[-0.08407424, 0.02657652, 0.06274654, ..., -0.1339224 ,
0.03225619, 0.01105761],
[ 0.00345623, 0.00779206, 0.03771318, ..., -0.07482201,
-0.04800161, -0.06592628],
[-0.10641567, -0.04975084, -0.04948104, ..., 0.0221828 ,
0.02330003, 0.00891378]], dtype=float32)
# --- all-MiniLM-L6-v2 embeddings, HDBSCAN min_cluster_size=400 ---
# No seed or zero-shot topic lists are passed in this configuration.

# Dimensionality reduction (seeded) followed by density-based clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=400,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Keyword extraction components; the two representation models are handed to
# BERTopic together as a list.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=model,
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=[mmr, kw],
    verbose=True,
)

# Fit on the precomputed embeddings, then dump the topic table and the
# keyword/weight list of every topic id that was assigned.
topics, probs = topic_model.fit_transform(
    tc1.corpus,
    tc1.corpus_embeddings,
)
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:", topic_model.get_topic(topic_id), sep="\n")
Topic Count Name \
0 -1 30941 -1_customer_buy_sale_buyer
1 0 5117 0_vape_shatter_carts_cartridge
2 1 2643 1_login_password_logged_error
3 2 2579 2_coca_opium_cocain_cocacolacompany
4 3 2124 3_xanaxlabs_xanaxlife_xanax_xanaxusa
5 4 1938 4_postal_usps_delivery_postage
6 5 1842 5_darkweb_darknetlive_darknetmarkets_sentenced
7 6 1721 6_empire_empiremarket_empireteam_empiredealer
8 7 1631 7_mdma_mdmamaster_pill_ecstasydata
9 8 1601 8_giftcard_card_giftcards_mastercard
10 9 1502 9_vendor_vendorpro_vendors_vendorbbmc
11 10 1417 10_scamming_scammer_scam_scammers
12 11 1126 11_counterfeiting_passport_counterfeit_fakeid
13 12 1072 12_dreammarket_nightmaremarket_market_dreams
14 13 979 13_lsd_tab_tabs_shrooms
15 14 739 14_monero_coinbase_coin_coins
16 15 676 15_review_reviewing_reviews_reviewer
17 16 674 16_pickledrick_heard_theoutfit_muttznutz
18 17 669 17_market_markets_marketplace_marketing
19 18 626 18_crosspost_deposting_goingpostal_vendors
20 19 603 19_deposit_depositing_deposits_ticket
21 20 573 20_pgpkey_pgp_pgps_pg
22 21 535 21_mod_moderator_dispute_disputes
23 22 450 22_cryptonia_cryptoniausers_cryptonians_cryptn...
24 23 445 23_wsm_wsms_vendorcp_machinerymint
25 24 443 24_ketamine_ketamin_ketamineking_ketaminekings
26 25 434 25_ticket_ticketmaster_ticketw_tickets
27 26 429 26_meth_methbusters_methamphetamine_crystal
Representation \
0 [customer, buy, sale, buyer, service, message,...
1 [vape, shatter, carts, cartridge, ounce, marij...
2 [login, password, logged, error, problem, log,...
3 [coca, opium, cocain, cocacolacompany, coke, c...
4 [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr...
5 [postal, usps, delivery, postage, mail, delive...
6 [darkweb, darknetlive, darknetmarkets, sentenc...
7 [empire, empiremarket, empireteam, empiredeale...
8 [mdma, mdmamaster, pill, ecstasydata, mdmaus, ...
9 [giftcard, card, giftcards, mastercard, cards,...
10 [vendor, vendorpro, vendors, vendorbbmc, vendo...
11 [scamming, scammer, scam, scammers, scammed, s...
12 [counterfeiting, passport, counterfeit, fakeid...
13 [dreammarket, nightmaremarket, market, dreams,...
14 [lsd, tab, tabs, shrooms, acid, blotter, blott...
15 [monero, coinbase, coin, coins, cryptocurrency...
16 [review, reviewing, reviews, reviewer, reviewe...
17 [pickledrick, heard, theoutfit, muttznutz, hou...
18 [market, markets, marketplace, marketing, nonm...
19 [crosspost, deposting, goingpostal, vendors, c...
20 [deposit, depositing, deposits, ticket, deposi...
21 [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ...
22 [mod, moderator, dispute, disputes, disputers,...
23 [cryptonia, cryptoniausers, cryptonians, crypt...
24 [wsm, wsms, vendorcp, machinerymint, wowza, pa...
25 [ketamine, ketamin, ketamineking, ketamineking...
26 [ticket, ticketmaster, ticketw, tickets, suppo...
27 [meth, methbusters, methamphetamine, crystal, ...
Representative_Docs
0 [dutchdrugz updates promo active till market p...
1 [sale girl scout cookie carts strains oz lb us...
2 [hey really could use help advice thanks, erro...
3 [colombian coke brazil ship world wide promoti...
4 [adderall mg ir adderall mg xanax super sale, ...
5 [informed delivery showing package, usa canada...
6 [three student arrested dark web drug traffick...
7 [empire anyone else, empire market back, empir...
8 [sale xtc pill mg mda us ca, uk mdma pill vend...
9 [carding amazon gift card, gift card prepaid d...
10 [nmm giving vendor runaround lying acting shad...
11 [market exit scam next, scam alert ukdrugdeale...
12 [buy counterfeit money real fake document, buy...
13 [dream market still, dream market, eleven drea...
14 [lsd blotter tab ug top quality, point one fre...
15 [looking best safe way buy large amount bitcoi...
16 [needing send sample bar trusted reviewer woul...
17 [anybody heard theoutfit, anybody heard pickle...
18 [market anyone else, market, currently working...
19 [envoy want crosspost, could vendor crosspost,...
20 [missing deposit double deposit please help, a...
21 [pgp public key, market pgp key, find pgp key]
22 [moderator dispute day, moderator please help ...
23 [cryptonia market, market king samsara crypton...
24 [wsm vendor, wsm back, wsm down]
25 [ketamine us, get ketamine, ketamine anyone]
26 [help support ticket please, help support tick...
27 [crystal meth uk, crystal meth, crystal meth v...
Topic 0:
[('vape', 0.4513024), ('shatter', 0.4508166), ('carts', 0.42475972), ('cartridge', 0.4150574), ('ounce', 0.38511506), ('marijuana', 0.3761327), ('cannabis', 0.37473193), ('edibles', 0.36946523), ('weed', 0.35874215), ('cart', 0.3494926)]
Topic 1:
[('login', 0.6874596), ('password', 0.58739483), ('logged', 0.44535103), ('error', 0.39473626), ('problem', 0.38404456), ('log', 0.3703017), ('account', 0.36962464), ('help', 0.36578366), ('trouble', 0.3579351), ('session', 0.34920555)]
Topic 2:
[('coca', 0.5442445), ('opium', 0.5241908), ('cocain', 0.48566723), ('cocacolacompany', 0.47682497), ('coke', 0.4701375), ('cocainehcl', 0.4403491), ('cocaine', 0.43470532), ('heroinfactory', 0.43406424), ('colombian', 0.40406665), ('cokemaster', 0.39702898)]
Topic 3:
[('xanaxlabs', 0.68098766), ('xanaxlife', 0.6694618), ('xanax', 0.64481914), ('xanaxusa', 0.5943617), ('xanaxring', 0.5927005), ('xanaxdepot', 0.5860753), ('xanaxdaddy', 0.57530177), ('xanaxblotters', 0.5676911), ('alprazolam', 0.5388765), ('xanaxinc', 0.5038374)]
Topic 4:
[('postal', 0.5783647), ('usps', 0.5671008), ('delivery', 0.552514), ('postage', 0.5435632), ('mail', 0.4794371), ('deliver', 0.46840727), ('package', 0.4595977), ('shipment', 0.4503156), ('shipping', 0.44325382), ('fedex', 0.44258836)]
Topic 5:
[('darkweb', 0.5460649), ('darknetlive', 0.47999817), ('darknetmarkets', 0.46108282), ('sentenced', 0.4581046), ('darknetmarketsnoobs', 0.4534067), ('darknet', 0.45285586), ('darkbay', 0.45059866), ('darkfail', 0.44140962), ('darkdotfail', 0.42702472), ('darknetaustralia', 0.42165762)]
Topic 6:
[('empire', 0.8657665), ('empiremarket', 0.8325376), ('empireteam', 0.7658358), ('empiredealer', 0.73584473), ('empires', 0.7089321), ('imperial', 0.59743464), ('imperialroyalty', 0.533589), ('market', 0.39446667), ('scammer', 0.3011508), ('nightmare', 0.29797795)]
Topic 7:
[('mdma', 0.57491755), ('mdmamaster', 0.55362886), ('pill', 0.54554516), ('ecstasydata', 0.54158187), ('mdmaus', 0.536477), ('mdacanada', 0.49906433), ('mda', 0.47733676), ('md', 0.47456974), ('ecstasy', 0.46981525), ('mg', 0.45221412)]
Topic 8:
[('giftcard', 0.68464833), ('card', 0.6067195), ('giftcards', 0.60337466), ('mastercard', 0.5686253), ('cards', 0.5325688), ('carding', 0.5214343), ('debit', 0.500812), ('carded', 0.49536285), ('carder', 0.48081687), ('cardable', 0.45047107)]
Topic 9:
[('vendor', 0.6717965), ('vendorpro', 0.64170885), ('vendors', 0.63945156), ('vendorbbmc', 0.6131782), ('vendorshop', 0.5619679), ('supplier', 0.4961744), ('shop', 0.43687624), ('inventory', 0.38063982), ('dealer', 0.37658587), ('trusted', 0.35675985)]
Topic 10:
[('scamming', 0.67339057), ('scammer', 0.64245546), ('scam', 0.6315777), ('scammers', 0.60618246), ('scammed', 0.5859374), ('scams', 0.5844768), ('exit', 0.38286078), ('ukdrugdealer', 0.37872887), ('warning', 0.35860184), ('confirmed', 0.3483911)]
Topic 11:
[('counterfeiting', 0.5351553), ('passport', 0.49532643), ('counterfeit', 0.48550797), ('fakeid', 0.46835682), ('forgery', 0.46821818), ('passports', 0.46553856), ('certificate', 0.46403533), ('fakeids', 0.36332572), ('licenses', 0.3491515), ('citizenship', 0.33687454)]
Topic 12:
[('dreammarket', 0.840524), ('nightmaremarket', 0.7301478), ('market', 0.679103), ('dreams', 0.5537206), ('nightmare', 0.54951864), ('dream', 0.52395815), ('dreaming', 0.51259714), ('nightmares', 0.5112673), ('dreamweaver', 0.4622426), ('deals', 0.4392535)]
Topic 13:
[('lsd', 0.6597349), ('tab', 0.4486916), ('tabs', 0.42244914), ('shrooms', 0.40983063), ('acid', 0.37709463), ('blotter', 0.3619333), ('blotters', 0.34030285), ('microdose', 0.31792137), ('dmt', 0.30784056), ('samspade', 0.306018)]
Topic 14:
[('monero', 0.66440576), ('coinbase', 0.6017641), ('coin', 0.58206344), ('coins', 0.55229485), ('cryptocurrency', 0.54781383), ('crypto', 0.5190888), ('bitcoin', 0.49815544), ('btc', 0.4951193), ('cryptocurrencies', 0.49073264), ('bitcoins', 0.48276216)]
Topic 15:
[('review', 0.7554549), ('reviewing', 0.70764035), ('reviews', 0.67082256), ('reviewer', 0.6707778), ('reviewed', 0.66799235), ('vendor', 0.3507808), ('post', 0.3232708), ('sample', 0.3039448), ('journal', 0.28708428), ('dankservices', 0.2783244)]
Topic 16:
[('pickledrick', 0.49188858), ('heard', 0.45528996), ('theoutfit', 0.4499943), ('muttznutz', 0.40856874), ('houseofdank', 0.38270152), ('purepharm', 0.3821613), ('thecandymanuk', 0.38004813), ('ndduk', 0.3797817), ('uzak', 0.37892848), ('turk', 0.37287065)]
Topic 17:
[('market', 0.9246511), ('markets', 0.82856095), ('marketplace', 0.66924006), ('marketing', 0.64059925), ('nonmarket', 0.63226146), ('undermarket', 0.5758176), ('traderoute', 0.5252505), ('farmersmarket', 0.51230544), ('demand', 0.48939776), ('trade', 0.4373095)]
Topic 18:
[('crosspost', 0.8023433), ('deposting', 0.54462177), ('goingpostal', 0.4369921), ('vendors', 0.3397432), ('courier', 0.31433263), ('tarred', 0.30136013), ('expose', 0.28236645), ('shop', 0.26232204), ('buyers', 0.25981808), ('weareamsterdam', 0.25617945)]
Topic 19:
[('deposit', 0.5940467), ('depositing', 0.54835135), ('deposits', 0.4703769), ('ticket', 0.4124618), ('deposited', 0.37039375), ('transaction', 0.32960162), ('btc', 0.29055083), ('fund', 0.28815228), ('unconfirmed', 0.28022093), ('twice', 0.27061075)]
Topic 20:
[('pgpkey', 0.78953433), ('pgp', 0.64266664), ('pgps', 0.60433674), ('pg', 0.57204497), ('pgc', 0.5202303), ('gnupg', 0.49523085), ('key', 0.4912796), ('gpg', 0.45877883), ('keys', 0.42667422), ('pgplogin', 0.40541986)]
Topic 21:
[('mod', 0.6461178), ('moderator', 0.6455801), ('dispute', 0.63188905), ('disputes', 0.53940743), ('disputers', 0.5393207), ('mods', 0.5271941), ('complaint', 0.47743487), ('modderator', 0.43813834), ('consensus', 0.3737623), ('handled', 0.37211758)]
Topic 22:
[('cryptonia', 0.82683897), ('cryptoniausers', 0.7519192), ('cryptonians', 0.7422215), ('cryptnonia', 0.6530852), ('cryptoni', 0.6209998), ('cryptoice', 0.5572725), ('market', 0.5073216), ('samasara', 0.42220467), ('samsera', 0.42188087), ('samsara', 0.3912958)]
Topic 23:
[('wsm', 0.8689953), ('wsms', 0.6338644), ('vendorcp', 0.41763154), ('machinerymint', 0.36969972), ('wowza', 0.36484522), ('paymwn', 0.32914096), ('maintenance', 0.31149185), ('greennz', 0.3085622), ('bionik', 0.30364022), ('bioniks', 0.30257553)]
Topic 24:
[('ketamine', 0.9532861), ('ketamin', 0.86957943), ('ketamineking', 0.8578399), ('ketaminekings', 0.8378519), ('ketaminehouse', 0.8028732), ('ketamax', 0.69982356), ('ketaconnect', 0.527894), ('tiletamine', 0.5001087), ('pyrimethamine', 0.48265585), ('pharmaceutical', 0.43739906)]
Topic 25:
[('ticket', 0.7282917), ('ticketmaster', 0.6860643), ('ticketw', 0.65911514), ('tickets', 0.62922376), ('support', 0.51385075), ('concert', 0.37351736), ('help', 0.29014573), ('assist', 0.28098187), ('fix', 0.27553594), ('outstanding', 0.27276954)]
Topic 26:
[('meth', 0.7546984), ('methbusters', 0.71206135), ('methamphetamine', 0.6617794), ('crystal', 0.6237694), ('methamph', 0.6163767), ('methoxetamine', 0.6146395), ('methadone', 0.58694017), ('dmethamphetamine', 0.5264992), ('methaqualone', 0.49982086), ('amphetamine', 0.49571955)]
Topic -1:
[('customer', 0.44219303), ('buy', 0.42263174), ('sale', 0.38992852), ('buyer', 0.38299185), ('service', 0.38183293), ('message', 0.37282392), ('update', 0.37055105), ('price', 0.37036857), ('paypal', 0.35097662), ('legit', 0.34381357)]
# Render the per-topic summary (count, name, keywords, representative docs) as the cell output.
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 30941 | -1_customer_buy_sale_buyer | [customer, buy, sale, buyer, service, message,... | [dutchdrugz updates promo active till market p... |
| 1 | 0 | 5117 | 0_vape_shatter_carts_cartridge | [vape, shatter, carts, cartridge, ounce, marij... | [sale girl scout cookie carts strains oz lb us... |
| 2 | 1 | 2643 | 1_login_password_logged_error | [login, password, logged, error, problem, log,... | [hey really could use help advice thanks, erro... |
| 3 | 2 | 2579 | 2_coca_opium_cocain_cocacolacompany | [coca, opium, cocain, cocacolacompany, coke, c... | [colombian coke brazil ship world wide promoti... |
| 4 | 3 | 2124 | 3_xanaxlabs_xanaxlife_xanax_xanaxusa | [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr... | [adderall mg ir adderall mg xanax super sale, ... |
| 5 | 4 | 1938 | 4_postal_usps_delivery_postage | [postal, usps, delivery, postage, mail, delive... | [informed delivery showing package, usa canada... |
| 6 | 5 | 1842 | 5_darkweb_darknetlive_darknetmarkets_sentenced | [darkweb, darknetlive, darknetmarkets, sentenc... | [three student arrested dark web drug traffick... |
| 7 | 6 | 1721 | 6_empire_empiremarket_empireteam_empiredealer | [empire, empiremarket, empireteam, empiredeale... | [empire anyone else, empire market back, empir... |
| 8 | 7 | 1631 | 7_mdma_mdmamaster_pill_ecstasydata | [mdma, mdmamaster, pill, ecstasydata, mdmaus, ... | [sale xtc pill mg mda us ca, uk mdma pill vend... |
| 9 | 8 | 1601 | 8_giftcard_card_giftcards_mastercard | [giftcard, card, giftcards, mastercard, cards,... | [carding amazon gift card, gift card prepaid d... |
| 10 | 9 | 1502 | 9_vendor_vendorpro_vendors_vendorbbmc | [vendor, vendorpro, vendors, vendorbbmc, vendo... | [nmm giving vendor runaround lying acting shad... |
| 11 | 10 | 1417 | 10_scamming_scammer_scam_scammers | [scamming, scammer, scam, scammers, scammed, s... | [market exit scam next, scam alert ukdrugdeale... |
| 12 | 11 | 1126 | 11_counterfeiting_passport_counterfeit_fakeid | [counterfeiting, passport, counterfeit, fakeid... | [buy counterfeit money real fake document, buy... |
| 13 | 12 | 1072 | 12_dreammarket_nightmaremarket_market_dreams | [dreammarket, nightmaremarket, market, dreams,... | [dream market still, dream market, eleven drea... |
| 14 | 13 | 979 | 13_lsd_tab_tabs_shrooms | [lsd, tab, tabs, shrooms, acid, blotter, blott... | [lsd blotter tab ug top quality, point one fre... |
| 15 | 14 | 739 | 14_monero_coinbase_coin_coins | [monero, coinbase, coin, coins, cryptocurrency... | [looking best safe way buy large amount bitcoi... |
| 16 | 15 | 676 | 15_review_reviewing_reviews_reviewer | [review, reviewing, reviews, reviewer, reviewe... | [needing send sample bar trusted reviewer woul... |
| 17 | 16 | 674 | 16_pickledrick_heard_theoutfit_muttznutz | [pickledrick, heard, theoutfit, muttznutz, hou... | [anybody heard theoutfit, anybody heard pickle... |
| 18 | 17 | 669 | 17_market_markets_marketplace_marketing | [market, markets, marketplace, marketing, nonm... | [market anyone else, market, currently working... |
| 19 | 18 | 626 | 18_crosspost_deposting_goingpostal_vendors | [crosspost, deposting, goingpostal, vendors, c... | [envoy want crosspost, could vendor crosspost,... |
| 20 | 19 | 603 | 19_deposit_depositing_deposits_ticket | [deposit, depositing, deposits, ticket, deposi... | [missing deposit double deposit please help, a... |
| 21 | 20 | 573 | 20_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ... | [pgp public key, market pgp key, find pgp key] |
| 22 | 21 | 535 | 21_mod_moderator_dispute_disputes | [mod, moderator, dispute, disputes, disputers,... | [moderator dispute day, moderator please help ... |
| 23 | 22 | 450 | 22_cryptonia_cryptoniausers_cryptonians_cryptn... | [cryptonia, cryptoniausers, cryptonians, crypt... | [cryptonia market, market king samsara crypton... |
| 24 | 23 | 445 | 23_wsm_wsms_vendorcp_machinerymint | [wsm, wsms, vendorcp, machinerymint, wowza, pa... | [wsm vendor, wsm back, wsm down] |
| 25 | 24 | 443 | 24_ketamine_ketamin_ketamineking_ketaminekings | [ketamine, ketamin, ketamineking, ketamineking... | [ketamine us, get ketamine, ketamine anyone] |
| 26 | 25 | 434 | 25_ticket_ticketmaster_ticketw_tickets | [ticket, ticketmaster, ticketw, tickets, suppo... | [help support ticket please, help support tick... |
| 27 | 26 | 429 | 26_meth_methbusters_methamphetamine_crystal | [meth, methbusters, methamphetamine, crystal, ... | [crystal meth uk, crystal meth, crystal meth v... |
# Cluster-quality metrics for the initial assignment, measured in the reduced
# UMAP space of the fitted topic model.
#
# BERTopic already fitted topic_model.umap_model during fit_transform, so the
# projection of the training corpus is available as `embedding_`. Reusing it
# avoids a second full UMAP fit and scores exactly the coordinates the model holds.
umap_embeddings = topic_model.umap_model.embedding_

# Outliers (topic -1) are not a cluster and are left out of both metrics.
indices = [index for index, topic in enumerate(topics) if topic != -1]
X = umap_embeddings[np.array(indices, dtype=int)]
labels = [topics[index] for index in indices]

silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6434006690979004 Davies_bouldin_score: 0.4681034572960446
# Recompute the topic keywords using 1- to 5-gram features. Only the vectorizer
# is swapped here; no new topic assignment is passed to update_topics.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 5),
    stop_words="english",
)
topic_model.update_topics(
    tc1.corpus,
    vectorizer_model=vectorizer_model,
)
# Interactive overviews of the fitted topics. Each call is left as a bare
# expression so the notebook renders the returned figure.
# Intertopic distance map.
topic_model.visualize_topics()
# Topic-to-topic similarity heatmap.
topic_model.visualize_heatmap()
# Hierarchical clustering of the topics.
topic_model.visualize_hierarchy()
# 2-D projection used only for plotting documents; the clustering itself uses
# the 10-component UMAP configured on the topic model.
# random_state is pinned (same seed as the clustering UMAP) so that the scatter
# layout is reproducible between runs instead of changing on every execution.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(tc1.corpus_embeddings)

# Passing the precomputed projection stops BERTopic from reducing the embeddings again.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
)
# Re-assign outlier documents (topic -1) to an existing topic using the
# "embeddings" strategy with a minimum similarity of 0.5.
# NOTE(review): the summary at the top of the notebook quotes a 0.53 threshold
# for this step -- confirm which value produced the reported numbers.
new_topics = topic_model.reduce_outliers(
    tc1.corpus,
    topics,
    strategy="embeddings",
    embeddings=tc1.corpus_embeddings,
    threshold=0.5,
)

# Rebuild the topic representations from the new assignment, then display them.
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-27 14:34:02,549 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 27323 | -1_anyone_vendor_order_review | [anyone, vendor, order, review, new, get, acco... | [dutchdrugz updates promo active till market p... |
| 1 | 0 | 5137 | 0_weed_cannabis_cart_review | [weed, cannabis, cart, review, thc, vendor, oz... | [sale girl scout cookie carts strains oz lb us... |
| 2 | 1 | 2700 | 1_help_login_need_account | [help, login, need, account, sub, back, passwo... | [hey really could use help advice thanks, erro... |
| 3 | 2 | 2601 | 2_cocaine_coke_heroin_drug | [cocaine, coke, heroin, drug, vendor, uk, best... | [colombian coke brazil ship world wide promoti... |
| 4 | 3 | 2270 | 3_xanax_mg_adderall_alprazolam | [xanax, mg, adderall, alprazolam, bar, diazepa... | [adderall mg ir adderall mg xanax super sale, ... |
| 5 | 4 | 2031 | 4_order_shipping_package_delivery | [order, shipping, package, delivery, shipped, ... | [informed delivery showing package, usa canada... |
| 6 | 5 | 1861 | 5_darknet_dark_tor_web | [darknet, dark, tor, web, onion, dark web, dar... | [three student arrested dark web drug traffick... |
| 7 | 6 | 1826 | 6_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire anyone else, empire market back, empir... |
| 8 | 7 | 1653 | 7_mdma_pill_mda_xtc | [mdma, pill, mda, xtc, mdma vendor, mg, usa, p... | [sale xtc pill mg mda us ca, uk mdma pill vend... |
| 9 | 8 | 1628 | 8_card_carding_cc_credit | [card, carding, cc, credit, cvv, credit card, ... | [carding amazon gift card, gift card prepaid d... |
| 10 | 9 | 3010 | 9_vendor_vendor vendor_inquiry_vendor inquiry | [vendor, vendor vendor, inquiry, vendor inquir... | [nmm giving vendor runaround lying acting shad... |
| 11 | 10 | 1741 | 10_scam_scammer_exit_scamming | [scam, scammer, exit, scamming, scammed, exit ... | [market exit scam next, scam alert ukdrugdeale... |
| 12 | 11 | 1147 | 11_counterfeit_id_fake_passport | [counterfeit, id, fake, passport, fake id, not... | [buy counterfeit money real fake document, buy... |
| 13 | 12 | 1202 | 12_dream_nightmare_dream market_market | [dream, nightmare, dream market, market, night... | [dream market still, dream market, eleven drea... |
| 14 | 13 | 1009 | 13_lsd_ug_tab_lsd vendor | [lsd, ug, tab, lsd vendor, acid, free, lsd tab... | [lsd blotter tab ug top quality, point one fre... |
| 15 | 14 | 854 | 14_monero_btc_bitcoin_coin | [monero, btc, bitcoin, coin, crypto, wallet, b... | [looking best safe way buy large amount bitcoi... |
| 16 | 15 | 926 | 15_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [needing send sample bar trusted reviewer woul... |
| 17 | 16 | 681 | 16_heard_anyone_anyone heard_happened | [heard, anyone, anyone heard, happened, has, h... | [anybody heard theoutfit, anybody heard pickle... |
| 18 | 17 | 989 | 17_market_market market_new market_new | [market, market market, new market, new, apoll... | [market anyone else, market, currently working... |
| 19 | 18 | 764 | 18_crosspost_review crosspost_crosspost vendor... | [crosspost, review crosspost, crosspost vendor... | [envoy want crosspost, could vendor crosspost,... |
| 20 | 19 | 671 | 19_deposit_deposited_ticket_address | [deposit, deposited, ticket, address, double, ... | [missing deposit double deposit please help, a... |
| 21 | 20 | 596 | 20_pgp_key_pgp key_public | [pgp, key, pgp key, public, public pgp, messag... | [pgp public key, market pgp key, find pgp key] |
| 22 | 21 | 551 | 21_dispute_dispute dispute_mod_moderator | [dispute, dispute dispute, mod, moderator, ple... | [moderator dispute day, moderator please help ... |
| 23 | 22 | 480 | 22_cryptonia_samsara_samsara market_cryptonia ... | [cryptonia, samsara, samsara market, cryptonia... | [cryptonia market, market king samsara crypton... |
| 24 | 23 | 485 | 23_wsm_wsm wsm_wsm vendor_vendor wsm | [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... | [wsm vendor, wsm back, wsm down] |
| 25 | 24 | 468 | 24_ketamine_ketamine vendor_mdma ketamine_keta... | [ketamine, ketamine vendor, mdma ketamine, ket... | [ketamine us, get ketamine, ketamine anyone] |
| 26 | 25 | 458 | 25_ticket_support ticket_support_please | [ticket, support ticket, support, please, mont... | [help support ticket please, help support tick... |
| 27 | 26 | 467 | 26_meth_crystal meth_crystal_meth vendor | [meth, crystal meth, crystal, meth vendor, met... | [crystal meth uk, crystal meth, crystal meth v... |
# Zero-shot classifier and the candidate intent labels it may choose from.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()


def _zero_shot_labels(threshold):
    """Run the project's topic-labelling helper at the given score threshold.

    The number of topics passed to the helper excludes the outlier topic (-1).
    NOTE(review): the exact meaning of `threshold` is defined inside
    ppt.assign_labels_to_topics, which is not visible here.
    """
    return ppt.assign_labels_to_topics(
        classifier,
        topic_model,
        zero_shot_topics,
        len(set(new_topics) - {-1}),
        threshold=threshold,
    )


# One label mapping per threshold, so the variants can be compared.
dict_zero_shots_25 = _zero_shot_labels(.25)
dict_zero_shots_2 = _zero_shot_labels(.2)
dict_zero_shots_17 = _zero_shot_labels(.17)
dict_zero_shots_15 = _zero_shot_labels(.15)

# Manual overrides for three topics in the 0.2 variant.
dict_zero_shots_2[18] = 'crosspost vendor'
dict_zero_shots_2[22] = 'samsara market'
dict_zero_shots_2[23] = 'wsm market'

# Write every variant to its own CSV as (Topic, Labels) rows.
for label_map, csv_path in (
    (dict_zero_shots_25, 'ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_025.csv'),
    (dict_zero_shots_2, 'ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_020.csv'),
    (dict_zero_shots_17, 'ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_017.csv'),
    (dict_zero_shots_15, 'ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_015.csv'),
):
    pd.DataFrame(list(label_map.items()), columns=['Topic', 'Labels']).to_csv(csv_path, index=False)
# Attach the zero-shot labels (0.2-threshold variant, including the manual
# overrides) to the model as custom topic labels.
topic_model.set_topic_labels(dict_zero_shots_2)

# Re-draw the plots with the custom labels. Each call stays a bare expression
# so the notebook renders the returned figure.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
    custom_labels=True,
)
topic_model.visualize_hierarchy(custom_labels=True)
# custom_labels=True added: this was the only plot in the group still showing
# the default topic names right after the custom labels were set.
topic_model.visualize_topics(custom_labels=True)
topic_model.visualize_barchart(top_n_topics=25, custom_labels=True, n_words=10)
# Cluster-quality metrics after outlier reduction, measured in the reduced
# UMAP space of the fitted topic model.
#
# Only the topic labels changed (new_topics); the UMAP model itself was fitted
# once by BERTopic, so its stored projection (`embedding_`) is reused instead of
# running another full UMAP fit on the corpus.
umap_embeddings = topic_model.umap_model.embedding_

# Documents still labelled -1 are outliers and are excluded from both metrics.
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X = umap_embeddings[np.array(indices, dtype=int)]
labels = [new_topics[index] for index in indices]

silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5175204277038574 Davies_bouldin_score: 0.7919422601150089
# Clean df in place so that one timestamp remains per distinct thread title.
# NOTE(review): this appears intended to mirror the cleaning behind tc1.corpus
# so that created_on lines up with it row by row -- confirm against TextClustering.
#
# The original chained `.dropna()` onto the right-hand side of this assignment.
# That had no effect: assigning back to the column re-aligns on the index and
# reintroduces NaN. Missing titles are removed by the dropna call below.
df['name_thread'] = df['name_thread'].str.lower()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)

created_on = df['created_on'].tolist()
len(created_on)

# Topic frequency over 100 time bins, drawn for the ten largest topics using
# the custom labels.
topics_over_time = topic_model.topics_over_time(
    tc1.corpus,
    created_on,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=100,
)
topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    width=1250,
    height=700,
    custom_labels=True,
)
15it [00:24, 1.62s/it]
# Build the per-document results table for every document that ended up in a
# real topic (outliers, topic -1, are dropped).
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
# probs are the values returned by fit_transform, i.e. from before the outliers
# were re-assigned; they are not recomputed for new_topics.
probs_valid = [probs[i] for i in indices]

# X holds the UMAP coordinates of these same non-outlier rows (built from
# new_topics in the metrics cell). It is added *before* the merge so each vector
# travels with its own document. Assigning it positionally after the merge is
# only correct if the merge keeps the left row order, which for inner joins has
# differed between pandas versions.
results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
    'UMAP_embedding': list(X),
})

# Add the topic-level columns (count, name, custom label, keywords, ...).
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
# Move UMAP_embedding back to the last position to keep the original column order.
results_final['UMAP_embedding'] = results_final.pop('UMAP_embedding')

print(results_final.shape)
results_final.head()
(38274, 11)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | CustomName | Representation | Representative_Docs | UMAP_embedding | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | review empire vendor acidbern | [-0.07762138, -0.049061198, -0.046745114, -0.0... | 6 | 0.527385 | 2020-01-09 | 1826 | 6_empire_empire market_empire empire_market | empire market | [empire, empire market, empire empire, market,... | [empire anyone else, empire market back, empir... | [9.086779, 3.6718397, 8.9006195, -1.1745992, 1... |
| 1 | vendor shipping combine priority | [-0.027722627, -0.0031221025, 0.01195772, -0.0... | 4 | 0.962274 | 2019-11-06 | 2031 | 4_order_shipping_package_delivery | order | [order, shipping, package, delivery, shipped, ... | [informed delivery showing package, usa canada... | [9.679236, 2.7164314, 8.733615, 0.011899776, 8... |
| 2 | open ticket since may ticket | [0.055031013, -0.018210536, -0.0026789573, -0.... | 25 | 1.000000 | 2020-01-09 | 458 | 25_ticket_support ticket_support_please | ticket support - ask help | [ticket, support ticket, support, please, mont... | [help support ticket please, help support tick... | [9.901975, 5.2703958, 11.463735, 0.47217792, 8... |
| 3 | vendor inquiry destroid dream | [-0.023196185, 0.0573189, 0.028408512, -0.0222... | 9 | 0.000000 | 2019-11-06 | 3010 | 9_vendor_vendor vendor_inquiry_vendor inquiry | inquiry - vendor vendor - vendor | [vendor, vendor vendor, inquiry, vendor inquir... | [nmm giving vendor runaround lying acting shad... | [9.912251, 4.028657, 7.623224, -0.7158077, 9.2... |
| 4 | morrison saver stamps uk money maker easiest m... | [-0.020903945, 0.050762244, -0.041445963, 0.01... | 11 | 0.799023 | 2020-01-09 | 1147 | 11_counterfeit_id_fake_passport | counterfeit money - fake IDs | [counterfeit, id, fake, passport, fake id, not... | [buy counterfeit money real fake document, buy... | [9.859931, 3.1459394, 9.145497, -1.0489817, 9.... |
# Persist the fitted topic model together with its embedding model.
# With pickle serialization `save_embedding_model` is an on/off switch. The
# original passed the SentenceTransformer object itself, which worked only
# because the object is truthy; the boolean states the intent directly.
# NOTE: pickle files execute code when loaded -- only load this file from a trusted source.
topic_model.save(
    "Models/topic_model_all-MiniLM-L6-v2_400",
    serialization="pickle",
    save_ctfidf=True,
    save_embedding_model=True,
)

# Store the per-document results table next to the model.
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_400.parquet')
HDBSCAN min_cluster_size = 200, all-MiniLM-L6-v2¶
# ---- Second BERTopic run: HDBSCAN min_cluster_size = 200 ----
# Same pipeline as the 400 run; only the minimum cluster size is lowered.

# Dimensionality reduction followed by density-based clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,  # fixed seed so the projection is reproducible
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=200,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Components that turn each cluster into a list of keywords.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=[mmr, kw],
    # NOTE(review): BERTopic documents n_gram_range as unused when a custom
    # CountVectorizer is supplied; set ngram_range on vectorizer_model if
    # bigrams were intended.
    n_gram_range=(1, 2),
    top_n_words=10,
    verbose=True,
)

# Reuse the precomputed corpus embeddings rather than encoding again.
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)

# Dump the topic overview, then the keyword/score pairs of every topic.
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:", topic_model.get_topic(topic_id), sep="\n")

# Rendered as the cell output.
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23926 | -1_opiateconnect_heinekenexpress_buy_dmt | [opiateconnect, heinekenexpress, buy, dmt, sho... | [good source dmt dream market, tramadol mg mg ... |
| 1 | 0 | 5394 | 0_carts_vape_cart_cannabis | [carts, vape, cart, cannabis, marijuana, straw... | [mg thc gummies cherry raspberry strawberry fl... |
| 2 | 1 | 2651 | 1_delivery_shipment_postage_delivered | [delivery, shipment, postage, delivered, posta... | [package say delivered po box never got, mail ... |
| 3 | 2 | 2204 | 2_vendor_vendors_vendorbbmc_trusted | [vendor, vendors, vendorbbmc, trusted, supplie... | [im looking vendor named buths bhuts shipping ... |
| 4 | 3 | 1951 | 3_guy_post_community_sub | [guy, post, community, sub, idea, posting, nig... | [sup fam ya boy ng min anybody legit right, hi... |
| 5 | 4 | 1715 | 4_empiremarket_empire_empiredealer_empireteam | [empiremarket, empire, empiredealer, empiretea... | [empire vendor, give me empire, top empire ven... |
| 6 | 5 | 1694 | 5_dreammarket_market_markets_nightmaremarket | [dreammarket, market, markets, nightmaremarket... | [miss dream ca nt use wallstreet market, wall ... |
| 7 | 6 | 1550 | 6_scamming_scammer_scam_scammers | [scamming, scammer, scam, scammers, scams, sca... | [cottageindustry possibly exit scamming select... |
| 8 | 7 | 1411 | 7_darkweb_sentenced_darknetmarketsnoobs_darkne... | [darkweb, sentenced, darknetmarketsnoobs, dark... | [darkweb vendor happytimes sentenced five year... |
| 9 | 8 | 1390 | 8_coca_cocain_cocacolacompany_cocainehcl | [coca, cocain, cocacolacompany, cocainehcl, co... | [review high purity colombian coke brazil, ful... |
| 10 | 9 | 1240 | 9_xanaxlabs_xanaxlife_xanax_xanaxcartel | [xanaxlabs, xanaxlife, xanax, xanaxcartel, xan... | [frankie new vendor mg real alprazolam xanax b... |
| 11 | 10 | 1130 | 10_mdma_mdmaus_mda_mdmamaster | [mdma, mdmaus, mda, mdmamaster, mdmamphetamine... | [per gram high purity mda promotion active sel... |
| 12 | 11 | 997 | 11_lsd_shrooms_tab_acid | [lsd, shrooms, tab, acid, tabs, psychedelic, m... | [point one lsd blotters lsd tab void realm tea... |
| 13 | 12 | 911 | 12_det_dere_igjen_en | [det, dere, igjen, en, privnote, kan, esrar, s... | [lever fortsatt valhalla noen som har en invit... |
| 14 | 13 | 813 | 13_monero_moneroatms_wallet_coin | [monero, moneroatms, wallet, coin, bitcoin, cr... | [buying coin anonymously needed monero, noob n... |
| 15 | 14 | 799 | 14_mastercard_card_usacards_cards | [mastercard, card, usacards, cards, carding, p... | [buying prepaid debit card btc eu, online card... |
| 16 | 15 | 687 | 15_tor_torguard_torbox_vpn | [tor, torguard, torbox, vpn, torstreet, vpns, ... | [configure tor browser disable javascript, use... |
| 17 | 16 | 659 | 16_crosspost_crossposting_goingpostal_crosspdf | [crosspost, crossposting, goingpostal, crosspd... | [lesson learnd googleplex saga prolific bar de... |
| 18 | 17 | 658 | 17_review_reviews_reviewer_reviewing | [review, reviews, reviewer, reviewing, reviewe... | [xpost danknation vendor review sunaero multis... |
| 19 | 18 | 636 | 18_marketplace_coremarket_market_markets | [marketplace, coremarket, market, markets, non... | [none marketplace link working, currently work... |
| 20 | 19 | 584 | 19_moderator_mod_dispute_disputee | [moderator, mod, dispute, disputee, disputers,... | [mod admin help dispute, dispute moderator ple... |
| 21 | 20 | 569 | 20_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, key, gnupg, gpg, keys,... | [pgp public key, market pgp key, pgp key] |
| 22 | 21 | 568 | 21_deposit_depositing_deposits_deposited | [deposit, depositing, deposits, deposited, add... | [btc deposit issue ticket, missing deposit dou... |
| 23 | 22 | 539 | 22_passport_passports_fakeid_certificate | [passport, passports, fakeid, certificate, for... | [photoshop documents fakeid photo id address p... |
| 24 | 23 | 478 | 23_cryptonia_cryptoniausers_cryptonians_cryptn... | [cryptonia, cryptoniausers, cryptonians, crypt... | [cryptonia market, market king samsara crypton... |
| 25 | 24 | 468 | 24_wsm_wkr_whita_terpwax | [wsm, wkr, whita, terpwax, whachu, wowza, gree... | [back me wsm, wsm back, wsm vendor] |
| 26 | 25 | 447 | 25_bunk_bar_bars_selaminy | [bunk, bar, bars, selaminy, thegeniusbar, bars... | [selaminy bar review, bunk pack selaminy, sela... |
| 27 | 26 | 444 | 26_meth_methbusters_methamphetamine_methamph | [meth, methbusters, methamphetamine, methamph,... | [looking crystal meth, crystal meth uk, crysta... |
| 28 | 27 | 443 | 27_ketamine_ketamin_ketamineking_ketaminekings | [ketamine, ketamin, ketamineking, ketamineking... | [ketamine uk vendor, review ketamine, ketamine... |
| 29 | 28 | 438 | 28_ticket_ticketmaster_ticketing_ticketw | [ticket, ticketmaster, ticketing, ticketw, tic... | [support ticket open month, support ticket tic... |
| 30 | 29 | 416 | 29_counterfeitmoney_counterfeit_counterfeits_c... | [counterfeitmoney, counterfeit, counterfeits, ... | [find best usd counterfeit note, best counterf... |
| 31 | 30 | 415 | 30_login_logins_password_authentication | [login, logins, password, authentication, mult... | [password login disabled, login problem fa err... |
| 32 | 31 | 409 | 31_ecstasy_ecstasydata_pill_pillsexpress | [ecstasy, ecstasydata, pill, pillsexpress, pil... | [best ecstasy pill, samsung mg ecstasy pills u... |
| 33 | 32 | 409 | 32_hacking_hacker_hackerforhire_hackers | [hacking, hacker, hackerforhire, hackers, hack... | [job btc hacking service needed, looking profe... |
| 34 | 33 | 401 | 33_adderall_adderalls_adderal_adderallz | [adderall, adderalls, adderal, adderallz, adde... | [back mg adderall ir straight pharmacy brand n... |
| 35 | 34 | 392 | 34_tails_tail_wallet_monero | [tails, tail, wallet, monero, electrum, electr... | [electrum tail personal monero wallet, tails e... |
| 36 | 35 | 376 | 35_mushrooms_mushroommafia_mushroom_mushroomchick | [mushrooms, mushroommafia, mushroom, mushroomc... | [mushcanada free sample grams psilocybe cubens... |
| 37 | 36 | 369 | 36_xmr_xmrs_btc_lfwxmr | [xmr, xmrs, btc, lfwxmr, xmrto, btcoin, xmrtop... | [xmr btc empire, btc xmr, xmr btc xmr xmr] |
| 38 | 37 | 349 | 37_drugmarket_drugpics_drugs_drugsource | [drugmarket, drugpics, drugs, drugsource, drug... | [energy control international use and abuse of... |
| 39 | 38 | 344 | 38_dread_dreade_dreaddit_dreaddits | [dread, dreade, dreaddit, dreaddits, dreadadve... | [new dread since dream, dread back, dread well] |
| 40 | 39 | 315 | 39_withdraw_withdrawling_withdrawing_withdrawled | [withdraw, withdrawling, withdrawing, withdraw... | [made withdraw btc, withdrawal working stuck p... |
| 41 | 40 | 311 | 40_escrow_escrows_payment_multisignature | [escrow, escrows, payment, multisignature, mar... | [escrow, much escrow, full escrow] |
| 42 | 41 | 302 | 41_heroin_opium_heroinreview_heroinfactory | [heroin, opium, heroinreview, heroinfactory, h... | [liquidgold afghan burmese heroin sale extende... |
| 43 | 42 | 300 | 42_oxycodone_oxycocodone_oxicodone_oxycodon | [oxycodone, oxycocodone, oxicodone, oxycodon, ... | [mg oxycodone instant release supeudol origina... |
| 44 | 43 | 285 | 43_dnm_dmn_dnms_dnmrelated | [dnm, dmn, dnms, dnmrelated, dm, dwm, dnmsuper... | [dnm avenger link, new dnm order, call dnm ven... |
| 45 | 44 | 266 | 44_paypal_paypalshow_paypals_transfers | [paypal, paypalshow, paypals, transfers, trans... | [looking legit website bank western union payp... |
| 46 | 45 | 262 | 45_ddos_ddosd_attacks_attack | [ddos, ddosd, attacks, attack, ddosed, attacke... | [new ddos attack, attack ddos, ddos attack] |
| 47 | 46 | 252 | 46_fraud_fraudsters_fraudster_frauding | [fraud, fraudsters, fraudster, frauding, fraud... | [new fraud vendor, fraud vendor, fraud] |
| 48 | 47 | 247 | 47_benzoblotters_benzobuddies_benzos_benzo | [benzoblotters, benzobuddies, benzos, benzo, b... | [czech republic worldwide discreetlab selling ... |
| 49 | 48 | 231 | 48_apollonmarket_apollon_market_apollo | [apollonmarket, apollon, market, apollo, myste... | [mysteryland apollon market big promotion deal... |
| 50 | 49 | 230 | 49_phishing_phising_phish_phishy | [phishing, phising, phish, phishy, phissing, p... | [phishing warning, phishing link, warning empi... |
| 51 | 50 | 217 | 50_opsec_opsexy_opec_opspec | [opsec, opsexy, opec, opspec, opsecaholic, net... | [dream opsec, opsec, opsec question] |
| 52 | 51 | 213 | 51_mirror_mirrors_reflection_links | [mirror, mirrors, reflection, links, url, link... | [mirror link working, anyone working mirror li... |
| 53 | 52 | 212 | 52_links_link_pm_works | [links, link, pm, works, need, url, send, work... | [please pm someone working link, someone pm wo... |
| 54 | 53 | 207 | 53_fentanyl_fentantyl_carfentanyl_carfentanil | [fentanyl, fentantyl, carfentanyl, carfentanil... | [furanyl fentanyl fentanyl analogue eu, lookin... |
| 55 | 54 | 203 | 54_cgmc_invitation_ggmc_invite | [cgmc, invitation, ggmc, invite, cmc, gcmc, co... | [need cgmc invite code, cgmc invite code, invi... |
| 56 | 55 | 202 | 55_cvv_cvvs_ccv_cvvbilling | [cvv, cvvs, ccv, cvvbilling, cmv, ccs, vcc, cc... | [looking trusted cc cvv vendor, uk cc cvv vend... |
# Cluster-quality metrics for the current topic assignment.
# The model's UMAP reducer is fitted again on the sentence embeddings and the
# scores are computed in that reduced space.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# Collect, in one pass, the positions and labels of every document that was
# assigned to a real topic; topic -1 (the outlier topic) is excluded.
indices, labels = [], []
for position, assigned_topic in enumerate(topics):
    if assigned_topic == -1:
        continue
    indices.append(position)
    labels.append(assigned_topic)

X = umap_embeddings[np.array(indices)]
silhouette_scores = silhouette_score(X, labels)
davies_bouldin = davies_bouldin_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin}")
silhouette_score: 0.6660425662994385 Davies_bouldin_score: 0.3869296287979983
# Reduce the number of topics in place; nr_topics='auto' leaves the choice of
# the final topic count to BERTopic.
topic_model.reduce_topics(tc1.corpus, nr_topics='auto')
# Refresh the per-document assignments so later cells see the reduced topics.
topics = topic_model.topics_
# Notebook cell output: summary table of the topics after reduction.
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23926 | -1_heinekenexpress_dmt_opiateconnect_tramadol | [heinekenexpress, dmt, opiateconnect, tramadol... | [good source dmt dream market, need know start... |
| 1 | 0 | 11635 | 0_cannabis_sale_edibles_price | [cannabis, sale, edibles, price, weed, shippin... | [adderall mg pharma gram aaa indoor nugs ounce... |
| 2 | 1 | 4393 | 1_scamming_scammer_scam_scammed | [scamming, scammer, scam, scammed, scams, phis... | [sale customer scamming alert vendor, partysqu... |
| 3 | 2 | 2651 | 2_delivery_package_shipment_postage | [delivery, package, shipment, postage, shippin... | [package marked delivered never arrived, packa... |
| 4 | 3 | 1951 | 3_post_posting_advice_community | [post, posting, advice, community, newbie, que... | [sup fam ya boy ng min anybody legit right, hi... |
| 5 | 4 | 1715 | 4_empire_empiremarket_empiredealer_empireteam | [empire, empiremarket, empiredealer, empiretea... | [empire deposit support, empire now back, empi... |
| 6 | 5 | 1694 | 5_dreammarket_dream_dreams_dreaming | [dreammarket, dream, dreams, dreaming, nightma... | [new wall st use dream quick question, dream m... |
| 7 | 6 | 1411 | 6_darkweb_darkbay_darknetmarkets_sentenced | [darkweb, darkbay, darknetmarkets, sentenced, ... | [father son sentenced prison selling drugs dar... |
| 8 | 7 | 1130 | 7_mdma_mdmamaster_mdmaus_mda | [mdma, mdmamaster, mdmaus, mda, mdmamphetamine... | [best domestic mdma mda fast shipping tracked ... |
| 9 | 8 | 911 | 8_det_dere_je_nede | [det, dere, je, nede, noen, du, igjen, vous, s... | [hejlpe til finne ut av hva jeg har mottatt, z... |
| 10 | 9 | 813 | 9_monero_moneroatms_wallet_bitcoin | [monero, moneroatms, wallet, bitcoin, crypto, ... | [monero btc, noob need help buying bitcoin mon... |
| 11 | 10 | 799 | 10_mastercard_card_carder_carding | [mastercard, card, carder, carding, cards, car... | [credit score balance hq debit card fullz appl... |
| 12 | 11 | 687 | 11_tor_torguard_vpn_torbox | [tor, torguard, vpn, torbox, vpns, torshops, t... | [really safe using tor vpn, use vpn tor tails,... |
| 13 | 12 | 659 | 12_crosspost_goingpostal_posted_marketplace | [crosspost, goingpostal, posted, marketplace, ... | [someone posted witchman account crosspost, ma... |
| 14 | 13 | 658 | 13_review_reviewing_reviews_reviewed | [review, reviewing, reviews, reviewed, reviewf... | [review please, xpost danknation vendor review... |
| 15 | 14 | 636 | 14_coremarket_marketplace_markets_market | [coremarket, marketplace, markets, market, non... | [none marketplace link working, core marketpla... |
| 16 | 15 | 584 | 15_dispute_moderator_disputes_disputers | [dispute, moderator, disputes, disputers, mod,... | [moderator please help dispute, dispute modera... |
| 17 | 16 | 569 | 16_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, key, gnupg, keys, gpg,... | [find pgp key, pgp key, vendor pgp key] |
| 18 | 17 | 568 | 17_deposit_depositing_deposits_deposited | [deposit, depositing, deposits, deposited, btc... | [generated deposit address deposited multiple ... |
| 19 | 18 | 539 | 18_passport_passports_fakeid_certificate | [passport, passports, fakeid, certificate, for... | [photoshop documents fakeid photo id address p... |
| 20 | 19 | 478 | 19_cryptonia_cryptonians_cryptoniausers_cryptn... | [cryptonia, cryptonians, cryptoniausers, crypt... | [cryptonia already, everyone move cryptonia ma... |
| 21 | 20 | 468 | 20_wsm_wsms_wkr_wxtra | [wsm, wsms, wkr, wxtra, whita, terpwax, whachu... | [back me wsm, wsm back, wsm vendor] |
| 22 | 21 | 447 | 21_bunk_bars_bar_barsbaby | [bunk, bars, bar, barsbaby, lonestarbars, theg... | [bunk pack selaminy, bunk bar, selaminy hulk b... |
| 23 | 22 | 443 | 22_ketamine_ketamineking_ketamin_ketaminekings | [ketamine, ketamineking, ketamin, ketamineking... | [review ketamine, ketamine review, ketamine us] |
| 24 | 23 | 438 | 23_ticket_ticketmaster_tickets_support | [ticket, ticketmaster, tickets, support, conce... | [support ticket support ticket, support ticket... |
| 25 | 24 | 416 | 24_counterfeit_counterfeitmoney_counterfeits_c... | [counterfeit, counterfeitmoney, counterfeits, ... | [counterfeit note, find best usd counterfeit n... |
| 26 | 25 | 415 | 25_login_logins_password_authentication | [login, logins, password, authentication, logg... | [password changed lost ca nt log, login proble... |
| 27 | 26 | 392 | 26_tails_tail_electrum_electrumtails | [tails, tail, electrum, electrumtails, electru... | [updated tail electrum issue setting gui moner... |
| 28 | 27 | 376 | 27_mushrooms_mushroommafia_mushroom_shrooms | [mushrooms, mushroommafia, mushroom, shrooms, ... | [mushcanada free sample grams psilocybe cubens... |
| 29 | 28 | 369 | 28_xmr_xmrs_lfwxmr_xmrto | [xmr, xmrs, lfwxmr, xmrto, btc, xmrtopy, xanxa... | [btc xmr, xmr btc, xmr btc xmr xmr] |
| 30 | 29 | 344 | 29_dread_dreaddit_dreaddits_dreadonion | [dread, dreaddit, dreaddits, dreadonion, dread... | [dread back, anything dread, dread well] |
| 31 | 30 | 315 | 30_withdraw_withdrawling_withdrawl_withdrawing | [withdraw, withdrawling, withdrawl, withdrawin... | [withdraw problem pending withdraw hour, withd... |
| 32 | 31 | 311 | 31_escrow_escrows_marketplace_payment | [escrow, escrows, marketplace, payment, commis... | [full escrow, escrow first, multisig escrow qu... |
| 33 | 32 | 285 | 32_dnm_dmn_dnms_dnmrelated | [dnm, dmn, dnms, dnmrelated, dm, dwm, dnmarket... | [new dnm first order question, dnm avenger lin... |
| 34 | 33 | 266 | 33_paypal_paypalshow_paypals_transfers | [paypal, paypalshow, paypals, transfers, trans... | [looking legit website bank western union payp... |
| 35 | 34 | 262 | 34_ddos_ddosd_attacks_attack | [ddos, ddosd, attacks, attack, ddosed, attacke... | [anything new nightmare ddos attack, ddos atta... |
| 36 | 35 | 252 | 35_fraud_fraudsters_fraudster_frauding | [fraud, fraudsters, fraudster, frauding, fraud... | [new fraud vendor, fraud vendor, fraud] |
| 37 | 36 | 247 | 36_benzoblotters_benzobuddies_benzos_benzo | [benzoblotters, benzobuddies, benzos, benzo, b... | [czech republic worldwide discreetlab selling ... |
| 38 | 37 | 231 | 37_apollonmarket_apollon_market_apollo | [apollonmarket, apollon, market, apollo, myste... | [mysteryland apollon market big promotion deal... |
| 39 | 38 | 217 | 38_opsec_opsexy_opspec_opec | [opsec, opsexy, opspec, opec, opsecaholic, net... | [dream opsec, opsec question, opsec] |
| 40 | 39 | 213 | 39_mirror_mirrors_empire_reflection | [mirror, mirrors, empire, reflection, working,... | [empire mirror working, anyone working mirror ... |
| 41 | 40 | 212 | 40_links_link_pm_works | [links, link, pm, works, need, url, working, s... | [working link please pm, please pm someone wor... |
| 42 | 41 | 203 | 41_cgmc_invitation_ggmc_invite | [cgmc, invitation, ggmc, invite, cmc, gcmc, co... | [need cgmc invite code, invite code cgmc, cgmc... |
# Same quality metrics as before, now evaluated on the reduced topic set.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# (position, topic) pairs for every document that is not in the outlier
# topic (-1); positions index into the UMAP matrix, topics are the labels.
assigned_pairs = [(pos, tid) for pos, tid in enumerate(topics) if tid != -1]
indices = [pos for pos, _ in assigned_pairs]
labels = [tid for _, tid in assigned_pairs]

X = umap_embeddings[np.array(indices)]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
davies_bouldin = davies_bouldin_score(X, labels)
print(f"Davies_bouldin_score: {davies_bouldin}")
silhouette_score: 0.34653472900390625 Davies_bouldin_score: 0.7209094786047956
# Recompute the topic representations with a vectorizer that removes English
# stop words and considers n-grams of one to five tokens.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
# Interactive overviews of the topics (rendered as notebook cell output).
topic_model.visualize_topics()
topic_model.visualize_hierarchy()
# Separate 2-D UMAP projection of the sentence embeddings, used only for the
# document scatter plots; it is independent of the model's own UMAP reducer.
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True)
# Reassign outlier documents (topic -1) to a topic using the "embeddings"
# strategy with a threshold of 0.6; documents that do not reach the threshold
# stay in topic -1.
new_topics = topic_model.reduce_outliers(
    tc1.corpus,
    topics,
    strategy="embeddings",
    embeddings=tc1.corpus_embeddings,
    threshold=0.6,
)

# Push the new assignments into the model and rebuild the representations.
# The custom vectorizer is passed again on purpose: when no vectorizer is
# given, update_topics falls back to a default CountVectorizer, which drops
# the English stop-word filter and the (1, 5) n-gram range configured above
# and lets stop words such as "anyone" or "please" into the topic keywords.
topic_model.update_topics(
    tc1.corpus,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
)

# Notebook cell output: topic summary after outlier reduction.
topic_model.get_topic_info()
2024-06-28 14:20:51,371 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23556 | -1_vendor_anyone_review_new | [vendor, anyone, review, new, account, order, ... | [good source dmt dream market, need know start... |
| 1 | 0 | 11636 | 0_weed_xanax_lsd_review | [weed, xanax, lsd, review, cocaine, mg, vendor... | [adderall mg pharma gram aaa indoor nugs ounce... |
| 2 | 1 | 4422 | 1_vendor_scammer_scam_scamming | [vendor, scammer, scam, scamming, exit, phishi... | [sale customer scamming alert vendor, partysqu... |
| 3 | 2 | 2655 | 2_order_shipping_pack_package | [order, shipping, pack, package, delivery, shi... | [package marked delivered never arrived, packa... |
| 4 | 3 | 1952 | 3_help_guy_need_back | [help, guy, need, back, day, time, question, a... | [sup fam ya boy ng min anybody legit right, hi... |
| 5 | 4 | 1734 | 4_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire deposit support, empire now back, empi... |
| 6 | 5 | 1696 | 5_dream_dream market_nightmare_market | [dream, dream market, nightmare, market, walls... | [new wall st use dream quick question, dream m... |
| 7 | 6 | 1411 | 6_darknet_dark_web_dark web | [darknet, dark, web, dark web, darkfail, sente... | [father son sentenced prison selling drugs dar... |
| 8 | 7 | 1151 | 7_mdma_mdma vendor_mda_usa | [mdma, mdma vendor, mda, usa, sale, mdma revie... | [best domestic mdma mda fast shipping tracked ... |
| 9 | 8 | 911 | 8_anyone_heard_happened_de | [anyone, heard, happened, de, anyone heard, ha... | [hejlpe til finne ut av hva jeg har mottatt, z... |
| 10 | 9 | 824 | 9_monero_bitcoin_btc_coin | [monero, bitcoin, btc, coin, wallet, crypto, b... | [monero btc, noob need help buying bitcoin mon... |
| 11 | 10 | 816 | 10_carding_card_credit_credit card | [carding, card, credit, credit card, debit, pr... | [credit score balance hq debit card fullz appl... |
| 12 | 11 | 687 | 11_onion_tor_vpn_javascript | [onion, tor, vpn, javascript, browser, tor bro... | [really safe using tor vpn, use vpn tor tails,... |
| 13 | 12 | 683 | 12_crosspost_review crosspost_giveaway_review | [crosspost, review crosspost, giveaway, review... | [someone posted witchman account crosspost, ma... |
| 14 | 13 | 706 | 13_review_vendor review_review vendor_vendor | [review, vendor review, review vendor, vendor,... | [review please, xpost danknation vendor review... |
| 15 | 14 | 697 | 14_market_market market_new market_marketplace | [market, market market, new market, marketplac... | [none marketplace link working, core marketpla... |
| 16 | 15 | 585 | 15_dispute_moderator_mod_dispute dispute | [dispute, moderator, mod, dispute dispute, ple... | [moderator please help dispute, dispute modera... |
| 17 | 16 | 573 | 16_pgp_key_pgp key_public | [pgp, key, pgp key, public, public pgp, lost, ... | [find pgp key, pgp key, vendor pgp key] |
| 18 | 17 | 578 | 17_deposit_deposited_address_btc | [deposit, deposited, address, btc, btc deposit... | [generated deposit address deposited multiple ... |
| 19 | 18 | 540 | 18_id_fake_passport_fake id | [id, fake, passport, fake id, license, scan, d... | [photoshop documents fakeid photo id address p... |
| 20 | 19 | 482 | 19_cryptonia_samsara_samsara market_cryptonia ... | [cryptonia, samsara, samsara market, cryptonia... | [cryptonia already, everyone move cryptonia ma... |
| 21 | 20 | 485 | 20_wsm_wsm wsm_wsm vendor_vendor wsm | [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... | [back me wsm, wsm back, wsm vendor] |
| 22 | 21 | 449 | 21_bar_bunk_selaminy_bars | [bar, bunk, selaminy, bars, hulk, bunk bar, th... | [bunk pack selaminy, bunk bar, selaminy hulk b... |
| 23 | 22 | 445 | 22_ketamine_ketamine vendor_ketamine review_re... | [ketamine, ketamine vendor, ketamine review, r... | [review ketamine, ketamine review, ketamine us] |
| 24 | 23 | 440 | 23_ticket_support ticket_support_please | [ticket, support ticket, support, please, mont... | [support ticket support ticket, support ticket... |
| 25 | 24 | 430 | 24_counterfeit_euro_note_counterfeit euro | [counterfeit, euro, note, counterfeit euro, co... | [counterfeit note, find best usd counterfeit n... |
| 26 | 25 | 418 | 25_login_account_password_log | [login, account, password, log, fa, error, ca ... | [password changed lost ca nt log, login proble... |
| 27 | 26 | 393 | 26_tails_tail_electrum_wallet | [tails, tail, electrum, wallet, whonix, monero... | [updated tail electrum issue setting gui moner... |
| 28 | 27 | 377 | 27_mushroom_shrooms_mushrooms_magic | [mushroom, shrooms, mushrooms, magic, cubensis... | [mushcanada free sample grams psilocybe cubens... |
| 29 | 28 | 379 | 28_xmr_btc xmr_btc_xmrto | [xmr, btc xmr, btc, xmrto, xmr btc, xmr deposi... | [btc xmr, xmr btc, xmr btc xmr xmr] |
| 30 | 29 | 348 | 29_dread_dread dread_sub dread_new dread | [dread, dread dread, sub dread, new dread, sub... | [dread back, anything dread, dread well] |
| 31 | 30 | 325 | 30_withdraw_withdrawal_withdrawl_working | [withdraw, withdrawal, withdrawl, working, btc... | [withdraw problem pending withdraw hour, withd... |
| 32 | 31 | 320 | 31_escrow_multisig_full escrow_extend | [escrow, multisig, full escrow, extend, extend... | [full escrow, escrow first, multisig escrow qu... |
| 33 | 32 | 290 | 32_dnm_dnms_dn_dnstars | [dnm, dnms, dn, dnstars, dnmuk, avenger, dm, d... | [new dnm first order question, dnm avenger lin... |
| 34 | 33 | 271 | 33_paypal_transfer_paypal transfer_paypal account | [paypal, transfer, paypal transfer, paypal acc... | [looking legit website bank western union payp... |
| 35 | 34 | 264 | 34_ddos_ddos attack_attack_ddos ddos | [ddos, ddos attack, attack, ddos ddos, market,... | [anything new nightmare ddos attack, ddos atta... |
| 36 | 35 | 262 | 35_fraud_fraudsters_fraud vendor_loan fraud | [fraud, fraudsters, fraud vendor, loan fraud, ... | [new fraud vendor, fraud vendor, fraud] |
| 37 | 36 | 256 | 36_benzos_benzo_rc_benzo vendor | [benzos, benzo, rc, benzo vendor, rc benzos, r... | [czech republic worldwide discreetlab selling ... |
| 38 | 37 | 232 | 37_apollon_apollon market_market_mysteryland | [apollon, apollon market, market, mysteryland,... | [mysteryland apollon market big promotion deal... |
| 39 | 38 | 219 | 38_opsec_opsec question_opsec opsec_question | [opsec, opsec question, opsec opsec, question,... | [dream opsec, opsec question, opsec] |
| 40 | 39 | 215 | 39_mirror_working mirror_working_mirror link | [mirror, working mirror, working, mirror link,... | [empire mirror working, anyone working mirror ... |
| 41 | 40 | 213 | 40_link_working link_working_pm | [link, working link, working, pm, link please,... | [working link please pm, please pm someone wor... |
| 42 | 41 | 203 | 41_cgmc_invite_invite code_code | [cgmc, invite, invite code, code, cgmc invite,... | [need cgmc invite code, invite code cgmc, cgmc... |
# Zero-shot labelling of the topics with an NLI model, followed by manual
# overrides for topics whose automatic label is replaced by hand.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Candidate labels come from the 'intent' column of the crime-intent CSV.
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()

# Number of real topics (the outlier topic -1 is not labelled).
n_labelled_topics = len(set(new_topics) - {-1})
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, n_labelled_topics, threshold=.25)
dict_zero_shots_2 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, n_labelled_topics, threshold=.2)

# Manual overrides, keyed by topic id. Two labels had spelling mistakes that
# did not match the topics they describe (cryptonia, cgmc); they are
# corrected here so the exported CSV and custom labels carry the right names.
dict_zero_shots_25[1] = 'phishing - scamming'
dict_zero_shots_25[12] = 'crosspost vendor'
dict_zero_shots_25[19] = 'cryptonia market'
dict_zero_shots_25[20] = 'wsm market'
dict_zero_shots_25[21] = 'bunk bar'
dict_zero_shots_25[31] = 'escrow service'
dict_zero_shots_25[39] = 'mirror link'
dict_zero_shots_25[40] = 'link'
dict_zero_shots_25[41] = 'cgmc - invite'

# Persist both label sets (threshold 0.25 with overrides, and threshold 0.20).
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_200/zero_shot_025.csv', index=False)
pd.DataFrame(list(dict_zero_shots_2.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_200/zero_shot_020.csv', index=False)

# Use the 0.25-threshold labels as the model's custom topic labels.
topic_model.set_topic_labels(dict_zero_shots_25)
# Visualizations that use the custom labels set on the model
# (rendered as notebook cell output).
# Document scatter plot on the precomputed 2-D projection.
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
hide_document_hover=True, hide_annotations=True, custom_labels=True)
# Top-10 keywords per topic for up to 42 topics.
topic_model.visualize_barchart(top_n_topics=42, custom_labels=True, n_words=10)
# Topic overview plot; called with default arguments, so custom labels are
# not requested here.
topic_model.visualize_topics()
topic_model.visualize_hierarchy(custom_labels=True)
# Cluster-quality metrics AFTER outlier reduction.
# The labels must come from `new_topics` (the assignments produced by
# reduce_outliers). The older `topics` list still holds the pre-reduction
# assignments, so scoring with it reproduces the previous metrics and hides
# the effect of reintroducing outliers.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# Positions of documents that belong to a real topic (topic -1 = outliers).
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X = umap_embeddings[np.array(indices)]
labels = [new_topics[index] for index in indices]

silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.34653472900390625 Davies_bouldin_score: 0.7209094786047956
# Normalise thread titles to lower case. Missing titles stay NaN at this
# point: a `.dropna()` on the right-hand side would be undone by index
# alignment on assignment, so rows are removed explicitly below instead.
df['name_thread'] = df['name_thread'].str.lower()
# Keep one row per distinct title, then discard rows without a title.
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
# Creation dates of the remaining threads, as a plain list.
# NOTE(review): later cells index this list with positions taken from the
# corpus, so it presumably has to line up with tc1.corpus row for row - confirm.
created_on = df['created_on'].tolist()
len(created_on)
65529
# Topic frequencies and representations over time: the creation dates are
# grouped into 100 bins, with both global and evolution tuning enabled.
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on,
global_tuning=True, evolution_tuning=True, nr_bins=100)
# Plot the 10 most frequent topics over time using the custom labels.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:25, 1.72s/it]
# Assemble one row per non-outlier document and enrich it with topic metadata.

# Positions of the documents that ended up in a real topic (not -1).
indices = [pos for pos, assigned in enumerate(new_topics) if assigned != -1]


def _select(sequence):
    """Return the items of *sequence* at the retained positions, as a list."""
    return [sequence[pos] for pos in indices]


corpus_valid = _select(tc1.corpus)
created_on_valid = _select(created_on)
embeddings_valid = _select(tc1.corpus_embeddings)
topics_valid = _select(new_topics)
# NOTE(review): `probs` is defined outside this section; presumably it comes
# from the original fit and may not reflect documents reassigned by the
# outlier reduction - confirm before relying on this column.
probs_valid = _select(probs)

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})

# Attach the per-topic columns from the model's topic table via the topic id.
topic_info = topic_model.get_topic_info()
results_final = results.merge(topic_info, on='Topic')
print(results_final.shape)
results_final.head()
(41973, 10)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | CustomName | Representation | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | retirement sale one last blowout mdma dry spee... | [-0.00200396, 0.060752388, 0.00081512495, -0.0... | 7 | 0.393620 | 2020-01-09 | 1151 | 7_mdma_mdma vendor_mda_usa | mdma - reviews vendor | [mdma, mdma vendor, mda, usa, sale, mdma revie... | [best domestic mdma mda fast shipping tracked ... |
| 1 | cash deposit | [-0.0044404618, 0.016640304, -0.035438363, 0.0... | 17 | 0.539291 | 2019-11-06 | 578 | 17_deposit_deposited_address_btc | deposit - address - deposited | [deposit, deposited, address, btc, btc deposit... | [generated deposit address deposited multiple ... |
| 2 | import meth contact tracking | [-0.05514505, -0.042183764, -0.060674116, -0.0... | 0 | 1.000000 | 2020-01-09 | 11636 | 0_weed_xanax_lsd_review | xanax - lsd - weed | [weed, xanax, lsd, review, cocaine, mg, vendor... | [adderall mg pharma gram aaa indoor nugs ounce... |
| 3 | please need working links | [0.013639548, -0.030973928, -0.05787297, 0.026... | 40 | 1.000000 | 2020-01-09 | 213 | 40_link_working link_working_pm | link | [link, working link, working, pm, link please,... | [working link please pm, please pm someone wor... |
| 4 | reliable dexedrine vendor | [-0.09150407, -0.024179617, 0.027147656, -0.06... | 0 | 0.404354 | 2020-01-09 | 11636 | 0_weed_xanax_lsd_review | xanax - lsd - weed | [weed, xanax, lsd, review, cocaine, mg, vendor... | [adderall mg pharma gram aaa indoor nugs ounce... |
# Bar chart of how many documents fall into each topic.
plt.figure(figsize=(10, 5))
# The trailing semicolon suppresses the Axes repr in the notebook output.
sns.countplot(results_final, x='Topic', orient='h');
# Persist the fitted model with pickle serialization, including the c-TF-IDF
# matrix. `model` (the embedding model argument) is defined outside this section.
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_200", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
2024-06-28 16:03:43,246 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
# Second copy of the model in safetensors format.
# NOTE(review): `model` is defined outside this section; check that it is the
# kind of value BERTopic expects for save_embedding_model with safetensors.
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_200_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)
# Export the per-document results table (documents, embeddings, topics, labels).
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_200.parquet')
import nbconvert
# IPython shell escape: render the results notebook to HTML.
!jupyter nbconvert --to html show_results.ipynb